def __init__(self, name, observation_shape, hid_size, num_hid_layers):
    with tf.variable_scope(name):
        self.scope = tf.get_variable_scope().name
        observations_ph = U.get_placeholder(name='ob', dtype=tf.float32,
                                            shape=[None] + list(observation_shape))
        with tf.variable_scope('obfilter'):
            self.ob_rms = RunningMeanStd(shape=observation_shape)
        with tf.variable_scope('vf'):
            last_out = tf.clip_by_value(
                (observations_ph - self.ob_rms.mean) / self.ob_rms.std, -5.0, 5.0)
            for i in range(num_hid_layers):
                last_out = tf.nn.tanh(
                    tf.layers.dense(last_out, hid_size, name='fc%i' % (i + 1),
                                    kernel_initializer=U.normc_initializer(1.0)))
            self.vpred = tf.layers.dense(last_out, 1, name='final',
                                         kernel_initializer=U.normc_initializer(1.0))[:, 0]
        self.predict = U.function([observations_ph], self.vpred)
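# --- Hedged usage sketch (not from the source): exercising the value network
# above. `ValueNet` stands in for whatever class this __init__ belongs to, and
# the env is illustrative; `predict` is the U.function compiled at the end.
import gym
env = gym.make("Hopper-v2")
vf = ValueNet("vf", env.observation_space.shape, hid_size=64, num_hid_layers=2)
U.initialize()
ob = env.reset()
v = vf.predict(ob[None])  # value estimate for a batch of one observation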
def _init(self, ob_space, ac_space, kind):
    assert isinstance(ob_space, gym.spaces.Box)

    self.pdtype = pdtype = make_pdtype(ac_space)
    sequence_length = None

    ob = U.get_placeholder(name="ob", dtype=tf.float32,
                           shape=[sequence_length] + list(ob_space.shape))

    x = ob / 255.0
    if kind == 'small':  # from A3C paper
        x = tf.nn.relu(U.conv2d(x, 16, "l1", [8, 8], [4, 4], pad="VALID"))
        x = tf.nn.relu(U.conv2d(x, 32, "l2", [4, 4], [2, 2], pad="VALID"))
        x = U.flattenallbut0(x)
        x = tf.nn.relu(U.dense(x, 256, 'lin', U.normc_initializer(1.0)))
    elif kind == 'large':  # Nature DQN
        x = tf.nn.relu(U.conv2d(x, 32, "l1", [8, 8], [4, 4], pad="VALID"))
        x = tf.nn.relu(U.conv2d(x, 64, "l2", [4, 4], [2, 2], pad="VALID"))
        x = tf.nn.relu(U.conv2d(x, 64, "l3", [3, 3], [1, 1], pad="VALID"))
        x = U.flattenallbut0(x)
        x = tf.nn.relu(U.dense(x, 512, 'lin', U.normc_initializer(1.0)))
    else:
        raise NotImplementedError

    logits = U.dense(x, pdtype.param_shape()[0], "logits", U.normc_initializer(0.01))
    self.pd = pdtype.pdfromflat(logits)
    self.vpred = U.dense(x, 1, "value", U.normc_initializer(1.0))[:, 0]

    self.state_in = []
    self.state_out = []

    stochastic = tf.compat.v1.placeholder(dtype=tf.bool, shape=())
    ac = self.pd.sample()  # XXX
    self._act = U.function([stochastic, ob], [ac, self.vpred])
def build_graph(self, obs_ph, acs_ph, reuse=False):
    with tf.variable_scope(self.scope):
        if reuse:
            tf.get_variable_scope().reuse_variables()

        with tf.variable_scope("obfilter"):
            self.obs_rms = RunningMeanStd(shape=self.observation_shape)
        # Normalize as (obs - mean) / std; the original expression
        # `obs_ph - self.obs_rms.mean / self.obs_rms.std` divided the mean by
        # the std first due to operator precedence.
        obs = (obs_ph - self.obs_rms.mean) / self.obs_rms.std

        last_out = obs
        last_out = tf.nn.tanh(U.conv2d(last_out, 64, 'vfconv1', (7, 7), (3, 3), pad='VALID'))
        last_out = tf.nn.tanh(U.conv2d(last_out, 64, 'vfconv2', (5, 5), (2, 2), pad='VALID'))
        last_out = tf.nn.tanh(U.conv2d(last_out, 64, 'vfconv3', (3, 3), (1, 1), pad='VALID'))
        last_out = tf.nn.tanh(U.conv2d(last_out, 64, 'vfconv4', (3, 3), (1, 1), pad='VALID'))
        last_out = tf.reshape(last_out, [-1, 784 * 4])
        last_out = tf.nn.tanh(
            tf.layers.dense(last_out, 512, kernel_initializer=U.normc_initializer(1.0)))
        last_out = tf.concat([last_out, acs_ph], axis=1)
        # Feed the concatenated features directly; the original passed
        # `last_out + self.num_actions`, which adds the integer action count to
        # every feature and was almost certainly a typo.
        logits = tf.layers.dense(last_out, 1, kernel_initializer=U.normc_initializer(1.0))
    return logits
def __init__(self, ob_dim, ac_dim):  # pylint: disable=W0613
    # X = tf.placeholder(tf.float32, shape=[None, ob_dim*2+ac_dim*2+2])  # batch of observations
    X = tf.placeholder(tf.float32, shape=[None, ob_dim * 2 + 2])  # batch of observations
    vtarg_n = tf.placeholder(tf.float32, shape=[None], name='vtarg')
    wd_dict = {}
    h1 = tf.nn.elu(dense(X, 64, "h1", weight_init=U.normc_initializer(1.0),
                         bias_init=0, weight_loss_dict=wd_dict))
    h2 = tf.nn.elu(dense(h1, 64, "h2", weight_init=U.normc_initializer(1.0),
                         bias_init=0, weight_loss_dict=wd_dict))
    vpred_n = dense(h2, 1, "hfinal", weight_init=U.normc_initializer(1.0),
                    bias_init=0, weight_loss_dict=wd_dict)[:, 0]
    sample_vpred_n = vpred_n + tf.random_normal(tf.shape(vpred_n))
    wd_loss = tf.get_collection("vf_losses", None)
    loss = U.mean(tf.square(vpred_n - vtarg_n)) + tf.add_n(wd_loss)
    loss_sampled = U.mean(tf.square(vpred_n - tf.stop_gradient(sample_vpred_n)))
    self._predict = U.function([X], vpred_n)
    # NOTE: `async` is a reserved keyword on Python 3.7+; this TF1-era kfac
    # build predates that (newer forks rename the kwarg).
    optim = kfac.KfacOptimizer(learning_rate=0.001, cold_lr=0.001 * (1 - 0.9), momentum=0.9,
                               clip_kl=0.3, epsilon=0.1, stats_decay=0.95,
                               async=1, kfac_update=2, cold_iter=50,
                               weight_decay_dict=wd_dict, max_grad_norm=None)
    vf_var_list = []
    for var in tf.trainable_variables():
        if "vf" in var.name:
            vf_var_list.append(var)
    update_op, self.q_runner = optim.minimize(loss, loss_sampled, var_list=vf_var_list)
    self.do_update = U.function([X, vtarg_n], update_op)  # pylint: disable=E1101
    U.initialize()  # Initialize uninitialized TF variables
def _init(self, ob_space, ac_space):
    self.pdtype = distributions.make_pdtype(ac_space)

    ob = U.get_placeholder(name='ob', dtype=tf.int32,
                           shape=[None] + list(ob_space.shape))
    next_blocks, my_grid, opp_grid = tf.split(ob, [16, 12 * 6, 12 * 6], axis=1)

    with tf.variable_scope('next_blocks'):
        next_blocks = tf.one_hot(next_blocks, depth=5)
        next_blocks = U.flattenallbut0(next_blocks)
        next_blocks = tf.nn.leaky_relu(
            tf.layers.dense(next_blocks, 12, name='l1',
                            kernel_initializer=U.normc_initializer(1.0)), alpha=0.1)
        next_blocks = tf.nn.leaky_relu(
            tf.layers.dense(next_blocks, 12, name='l2',
                            kernel_initializer=U.normc_initializer(1.0)), alpha=0.1)
    with tf.variable_scope('grids', reuse=False):
        my_grid = _grid_cnn(my_grid)
    with tf.variable_scope('grids', reuse=True):
        opp_grid = _grid_cnn(opp_grid)

    x = tf.concat([next_blocks, my_grid, opp_grid], axis=1)
    x = tf.nn.leaky_relu(
        tf.layers.dense(x, 64, name='lin',
                        kernel_initializer=U.normc_initializer(1.0)), alpha=0.1)

    logits = tf.layers.dense(x, self.pdtype.param_shape()[0], name='logits',
                             kernel_initializer=U.normc_initializer(0.01))
    self.pd = self.pdtype.pdfromflat(logits)
    self.vpred = tf.layers.dense(x, 1, name='value',
                                 kernel_initializer=U.normc_initializer(1.0))[:, 0]

    self.state_in = []
    self.state_out = []

    stochastic = tf.placeholder(dtype=tf.bool, shape=())
    ac = U.switch(stochastic, self.pd.sample(), self.pd.mode())
    self._act = U.function([stochastic, ob], [ac, self.vpred])
def _init(self, ob_space, ac_space):
    assert isinstance(ob_space, gym.spaces.Box)

    self.pdtype = pdtype = make_pdtype(ac_space)
    sequence_length = None

    ob = U.get_placeholder(name="ob", dtype=tf.float32,
                           shape=[sequence_length] + list(ob_space.shape))
    obscaled = ob / 255.0

    with tf.variable_scope("pol"):
        x = obscaled
        x = tf.nn.relu(U.conv2d(x, 8, "l1", [8, 8], [4, 4], pad="VALID"))
        x = tf.nn.relu(U.conv2d(x, 16, "l2", [4, 4], [2, 2], pad="VALID"))
        x = U.flattenallbut0(x)
        x = tf.nn.relu(U.dense(x, 128, 'lin', U.normc_initializer(1.0)))
        logits = U.dense(x, pdtype.param_shape()[0], "logits", U.normc_initializer(0.01))
        self.pd = pdtype.pdfromflat(logits)
    with tf.variable_scope("vf"):
        x = obscaled
        x = tf.nn.relu(U.conv2d(x, 8, "l1", [8, 8], [4, 4], pad="VALID"))
        x = tf.nn.relu(U.conv2d(x, 16, "l2", [4, 4], [2, 2], pad="VALID"))
        x = U.flattenallbut0(x)
        x = tf.nn.relu(U.dense(x, 128, 'lin', U.normc_initializer(1.0)))
        self.vpred = U.dense(x, 1, "value", U.normc_initializer(1.0))
        self.vpredz = self.vpred

    self.state_in = []
    self.state_out = []

    stochastic = tf.placeholder(dtype=tf.bool, shape=())
    ac = self.pd.sample()  # XXX
    self._act = U.function([stochastic, ob], [ac, self.vpred])
def _init(self, ob_space, ac_space, hid_size, num_hid_layers, gaussian_fixed_var=True):
    assert isinstance(ob_space, gym.spaces.Box)

    self.pdtype = pdtype = make_pdtype(ac_space)
    sequence_length = None

    ob = U.get_placeholder(name="ob", dtype=tf.float32,
                           shape=[sequence_length] + list(ob_space.shape))

    with tf.variable_scope("obfilter"):
        self.ob_rms = RunningMeanStd(shape=ob_space.shape)

    obz = tf.clip_by_value((ob - self.ob_rms.mean) / self.ob_rms.std, -5.0, 5.0)

    last_out = obz
    for i in range(num_hid_layers):
        last_out = tf.nn.tanh(U.dense(last_out, hid_size, "vffc%i" % (i + 1),
                                      weight_init=U.normc_initializer(1.0)))
    self.vpred = U.dense(last_out, 1, "vffinal",
                         weight_init=U.normc_initializer(1.0))[:, 0]

    last_out = obz
    for i in range(num_hid_layers):
        last_out = tf.nn.tanh(U.dense(last_out, hid_size, "polfc%i" % (i + 1),
                                      weight_init=U.normc_initializer(1.0)))
    if gaussian_fixed_var and isinstance(ac_space, gym.spaces.Box):
        mean = U.dense(last_out, pdtype.param_shape()[0] // 2, "polfinal",
                       U.normc_initializer(0.01))
        logstd = tf.get_variable(name="logstd",
                                 shape=[1, pdtype.param_shape()[0] // 2],
                                 initializer=tf.zeros_initializer())
        pdparam = U.concatenate([mean, mean * 0.0 + logstd], axis=1)
    else:
        pdparam = U.dense(last_out, pdtype.param_shape()[0], "polfinal",
                          U.normc_initializer(0.01))

    self.pd = pdtype.pdfromflat(pdparam)

    self.state_in = []
    self.state_out = []

    stochastic = tf.placeholder(dtype=tf.bool, shape=())
    ac = U.switch(stochastic, self.pd.sample(), self.pd.mode())
    self._act = U.function([stochastic, ob], [ac, self.vpred])
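# --- Hedged usage sketch (assumed names `pi`, `env`, not from the source): one
# environment step with the compiled _act function, which returns
# [action, vpred] for a batch of observations.
ob = env.reset()
ac, vpred = pi._act(True, ob[None])  # True -> sample stochastically
ob, rew, done, _ = env.step(ac[0])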
def _init(self, ob_space, ac_space, kind):
    assert isinstance(ob_space, gym.spaces.Box)

    self.pdtype = pdtype = make_pdtype(ac_space)
    sequence_length = None

    ob = U.get_placeholder(name="ob", dtype=tf.float32,
                           shape=[sequence_length] + list(ob_space.shape))

    x = ob / 255.0
    if kind == 'small':  # from A3C paper
        x = tf.nn.relu(U.conv2d(x, 16, "l1", [8, 8], [4, 4], pad="VALID"))
        x = tf.nn.relu(U.conv2d(x, 32, "l2", [4, 4], [2, 2], pad="VALID"))
        x = U.flattenallbut0(x)
        x = tf.nn.relu(tf.layers.dense(x, 256, name='lin',
                                       kernel_initializer=U.normc_initializer(1.0)))
    elif kind == 'large':  # Nature DQN
        x = tf.nn.relu(U.conv2d(x, 32, "l1", [8, 8], [4, 4], pad="VALID"))
        x = tf.nn.relu(U.conv2d(x, 64, "l2", [4, 4], [2, 2], pad="VALID"))
        x = tf.nn.relu(U.conv2d(x, 64, "l3", [3, 3], [1, 1], pad="VALID"))
        x = U.flattenallbut0(x)
        x = tf.nn.relu(tf.layers.dense(x, 512, name='lin',
                                       kernel_initializer=U.normc_initializer(1.0)))
    else:
        raise NotImplementedError

    logits = tf.layers.dense(x, pdtype.param_shape()[0], name='logits',
                             kernel_initializer=U.normc_initializer(0.01))
    self.pd = pdtype.pdfromflat(logits)
    self.vpred = tf.layers.dense(x, 1, name='value',
                                 kernel_initializer=U.normc_initializer(1.0))[:, 0]

    self.state_in = []
    self.state_out = []

    stochastic = tf.placeholder(dtype=tf.bool, shape=())
    ac = self.pd.sample()  # XXX
    self._act = U.function([stochastic, ob], [ac, self.vpred])
def __init__(self, ob_dim, ac_dim):
    # Here we'll construct a bunch of expressions, which will be used in two places:
    # (1) When sampling actions
    # (2) When computing loss functions, for the policy update
    # Variables specific to (1) have the word "sampled" in them,
    # whereas variables specific to (2) have the word "old" in them
    ob_no = tf.placeholder(tf.float32, shape=[None, ob_dim * 2], name="ob")  # batch of observations
    oldac_na = tf.placeholder(tf.float32, shape=[None, ac_dim], name="ac")  # batch of previous actions
    oldac_dist = tf.placeholder(tf.float32, shape=[None, ac_dim * 2], name="oldac_dist")  # batch of previous action distributions
    adv_n = tf.placeholder(tf.float32, shape=[None], name="adv")  # advantage function estimate
    wd_dict = {}
    h1 = tf.nn.tanh(dense(ob_no, 64, "h1", weight_init=U.normc_initializer(1.0), bias_init=0.0, weight_loss_dict=wd_dict))
    h2 = tf.nn.tanh(dense(h1, 64, "h2", weight_init=U.normc_initializer(1.0), bias_init=0.0, weight_loss_dict=wd_dict))
    mean_na = dense(h2, ac_dim, "mean", weight_init=U.normc_initializer(0.1), bias_init=0.0, weight_loss_dict=wd_dict)  # Mean control output
    self.wd_dict = wd_dict
    self.logstd_1a = logstd_1a = tf.get_variable("logstd", [ac_dim], tf.float32, tf.zeros_initializer())  # Variance on outputs
    logstd_1a = tf.expand_dims(logstd_1a, 0)
    std_1a = tf.exp(logstd_1a)
    std_na = tf.tile(std_1a, [tf.shape(mean_na)[0], 1])
    ac_dist = tf.concat([tf.reshape(mean_na, [-1, ac_dim]), tf.reshape(std_na, [-1, ac_dim])], 1)
    sampled_ac_na = tf.random_normal(tf.shape(ac_dist[:, ac_dim:])) * ac_dist[:, ac_dim:] + ac_dist[:, :ac_dim]  # This is the sampled action we'll perform.
    logprobsampled_n = - tf.reduce_sum(tf.log(ac_dist[:, ac_dim:]), axis=1) - 0.5 * tf.log(2.0 * np.pi) * ac_dim - 0.5 * tf.reduce_sum(tf.square(ac_dist[:, :ac_dim] - sampled_ac_na) / (tf.square(ac_dist[:, ac_dim:])), axis=1)  # Logprob of sampled action
    logprob_n = - tf.reduce_sum(tf.log(ac_dist[:, ac_dim:]), axis=1) - 0.5 * tf.log(2.0 * np.pi) * ac_dim - 0.5 * tf.reduce_sum(tf.square(ac_dist[:, :ac_dim] - oldac_na) / (tf.square(ac_dist[:, ac_dim:])), axis=1)  # Logprob of previous actions under CURRENT policy (whereas oldlogprob_n is under OLD policy)
    kl = tf.reduce_mean(kl_div(oldac_dist, ac_dist, ac_dim))
    # kl = .5 * tf.reduce_mean(tf.square(logprob_n - oldlogprob_n))  # Approximation of KL divergence between old policy used to generate actions, and new policy used to compute logprob_n
    surr = - tf.reduce_mean(adv_n * logprob_n)  # Loss function that we'll differentiate to get the policy gradient
    surr_sampled = - tf.reduce_mean(logprob_n)  # Sampled loss of the policy
    self._act = U.function([ob_no], [sampled_ac_na, ac_dist, logprobsampled_n])  # Generate a new action and its logprob
    # self.compute_kl = U.function([ob_no, oldac_na, oldlogprob_n], kl)  # Compute (approximate) KL divergence between old policy and new policy
    self.compute_kl = U.function([ob_no, oldac_dist], kl)
    self.update_info = ((ob_no, oldac_na, adv_n), surr, surr_sampled)  # Input and output variables needed for computing loss
    U.initialize()  # Initialize uninitialized TF variables
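# --- Standalone numpy check (not from the source) of the diagonal-Gaussian
# log-density used for logprob_n / logprobsampled_n above:
#   log p(a) = -sum(log std) - (D/2) * log(2*pi) - 0.5 * sum(((a - mean)/std)^2)
import numpy as np
from scipy.stats import norm

rng = np.random.RandomState(0)
ac_dim = 3
mean = rng.randn(5, ac_dim)
std = np.exp(0.1 * rng.randn(5, ac_dim))
ac = rng.randn(5, ac_dim)

logprob = (-np.sum(np.log(std), axis=1)
           - 0.5 * np.log(2.0 * np.pi) * ac_dim
           - 0.5 * np.sum(np.square((mean - ac) / std), axis=1))
ref = norm.logpdf(ac, loc=mean, scale=std).sum(axis=1)  # per-dimension reference
assert np.allclose(logprob, ref)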
def _create_logit_value(self, action_layer, value_layer, gaussian_fixed_var=False):
    # actor
    if gaussian_fixed_var and isinstance(self.ac_space, gym.spaces.Box):
        mean = U.dense(action_layer, self.pdtype.param_shape()[0] // 2,
                       "polfinal", U.normc_initializer(0.01))
        logstd = tf.get_variable(name="logstd",
                                 shape=[1, self.pdtype.param_shape()[0] // 2],
                                 initializer=tf.zeros_initializer())
        pdparam = U.concatenate([mean, mean * 0.0 + logstd], axis=1)
    else:
        pdparam = U.dense(action_layer, self.pdtype.param_shape()[0],
                          "polfinal", U.normc_initializer(0.01))
    self.pd = self.pdtype.pdfromflat(pdparam)
    self.ac = U.switch(self.stochastic, self.pd.sample(), self.pd.mode())

    # critic
    self.vpred = U.dense(value_layer, 1, "vffinal",
                         weight_init=U.normc_initializer(1.0))[:, 0]
def _create_network(self):
    x = self.ob
    # create ob filter
    if self.ob_filter:
        self.ob_rms = RunningMeanStd(shape=self.ob_space.shape)
        x = tf.clip_by_value((self.ob - self.ob_rms.mean) / self.ob_rms.std, -5.0, 5.0)

    # actor
    l = x
    l = tf.nn.tanh(U.dense(l, 32, "a_1", weight_init=U.normc_initializer(1.0)))
    l = tf.nn.tanh(U.dense(l, 32, "a_2", weight_init=U.normc_initializer(1.0)))
    action_layer = l

    # critic
    l = x
    l = tf.nn.tanh(U.dense(l, 32, "c_1", weight_init=U.normc_initializer(1.0)))
    l = tf.nn.tanh(U.dense(l, 32, "c_2", weight_init=U.normc_initializer(1.0)))
    value_layer = l

    self._create_logit_value(action_layer, value_layer, self.gaussian_fixed_var)
def _init(self, in_dim, out_dim, hid_size, num_hid_layers, last_init_size=0.01, name='ff'):
    # state_dim: dimension of input/output state from previous/root encoder
    self.params = []
    self.num_hid_layers = num_hid_layers
    self.intin_dim = in_dim

    last_out_dim = in_dim
    for i in range(num_hid_layers):
        w, b = dense_params(last_out_dim, hid_size, name + "%i" % (i + 1),
                            weight_init=U.normc_initializer(1.0))
        self.params.append([w, b])
        last_out_dim = hid_size
    w, b = dense_params(last_out_dim, out_dim, name + "_out",
                        weight_init=U.normc_initializer(last_init_size))
    self.params.append([w, b])
def __init__(self, sess, ob_space, ac_space, nbatch, nsteps, reuse=False):  # pylint: disable=W0613
    nh, nw, nc = ob_space.shape
    ob_shape = (nbatch, nh, nw, nc)
    nact = ac_space.n
    X = tf.placeholder(tf.float32, ob_shape)  # obs
    print(ob_shape)
    self.pdtype = pdtype = make_pdtype(ac_space)

    with tf.variable_scope("model", reuse=reuse):
        '''
        h = conv(X, 'c1', nf=64, rf=3, stride=1, init_scale=np.sqrt(2), pad="SAME")
        h2 = conv(h, 'c2', nf=64, rf=3, stride=1, init_scale=np.sqrt(2), pad="SAME")
        h3 = conv(h2, 'c3', nf=128, rf=3, stride=1, init_scale=np.sqrt(2), pad="SAME")
        h3 = conv_to_fc(h3)
        h4 = fc(h3, 'fc1', nh=512, init_scale=np.sqrt(2))
        hh = conv(X, 'xc1', nf=64, rf=3, stride=1, init_scale=np.sqrt(2), pad="SAME")
        hh2 = conv(hh, 'xc2', nf=64, rf=3, stride=1, init_scale=np.sqrt(2), pad="SAME")
        hh3 = conv(hh2, 'xc3', nf=128, rf=3, stride=1, init_scale=np.sqrt(2), pad="SAME")
        hh3 = conv_to_fc(hh3)
        hh4 = fc(hh3, 'xfc1', nh=512, init_scale=np.sqrt(2))
        pi = fc(h4, 'pi', nact, act=lambda x: x, init_scale=0.01)
        vf = fc(hh4, 'v', 1, act=lambda x: x)[:, 0]
        '''
        # Separate conv towers for the policy (x) and the value function (y).
        x = tf.nn.relu(U.conv2d(X, 32, "l1", [3, 3], [1, 1], pad="SAME"))
        x = tf.nn.relu(U.conv2d(x, 64, "l2", [3, 3], [1, 1], pad="SAME"))
        x = tf.nn.relu(U.conv2d(x, 128, "l3", [3, 3], [1, 1], pad="SAME"))
        x = U.flattenallbut0(x)
        x = tf.nn.relu(U.dense(x, 512, 'lin', U.normc_initializer(1.0)))

        y = tf.nn.relu(U.conv2d(X, 32, "yl1", [3, 3], [1, 1], pad="SAME"))
        y = tf.nn.relu(U.conv2d(y, 64, "yl2", [3, 3], [1, 1], pad="SAME"))
        y = tf.nn.relu(U.conv2d(y, 128, "yl3", [3, 3], [1, 1], pad="SAME"))
        y = U.flattenallbut0(y)
        y = tf.nn.relu(U.dense(y, 512, 'ylin', U.normc_initializer(1.0)))

        pi = U.dense(x, pdtype.param_shape()[0], "logits", U.normc_initializer(0.01))
        vf = U.dense(y, 1, "value", U.normc_initializer(1.0))[:, 0]

    self.pd = self.pdtype.pdfromflat(pi)
    a0 = self.pd.sample()
    neglogp0 = self.pd.neglogp(a0)
    self.initial_state = None

    def step(ob, *_args, **_kwargs):
        a, v, neglogp = sess.run([a0, vf, neglogp0], {X: ob})
        return a, v, self.initial_state, neglogp

    def value(ob, *_args, **_kwargs):
        return sess.run(vf, {X: ob})

    self.X = X
    self.pi = pi
    self.vf = vf
    self.step = step
    self.value = value
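# --- Hedged usage sketch (assumed names `model`, `env`, `nsteps`): how an
# A2C/PPO-style runner typically drives the step()/value() callables above.
obs = env.reset()
for _ in range(nsteps):
    actions, values, _states, neglogps = model.step(obs)
    obs, rewards, dones, _ = env.step(actions)
last_values = model.value(obs)  # bootstrap value for the final observation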
def _init(self, in_dim, out_dim, hid_size, num_hid_layers, last_init_size=0.01):
    # state_dim: dimension of input/output state from previous/root encoder
    self.params = []
    self.num_hid_layers = num_hid_layers
    self.intin_dim = in_dim - 1

    last_out_dim = in_dim - 1
    for i in range(num_hid_layers):
        w, b = dense_params(last_out_dim, hid_size, "ff%i" % (i + 1),
                            weight_init=U.normc_initializer(1.0))
        logmask = tf.get_variable(name="logmask%i" % (i + 1), shape=[1],
                                  initializer=tf.constant_initializer(-1.0))
        self.params.append([w, b, logmask])
        last_out_dim = hid_size
    w, b = dense_params(last_out_dim, out_dim, "ff_out",
                        weight_init=U.normc_initializer(last_init_size))
    logmask = tf.get_variable(name="logmask_out", shape=[1],
                              initializer=tf.constant_initializer(-1.0))
    self.params.append([w, b, logmask])
def _init(self, ob_space, ac_space, hid_size, num_hid_layers, exploration_rate,
          gaussian_fixed_var=True):
    assert isinstance(ob_space, gym.spaces.Box)

    self.pdtype = pdtype = make_pdtype(ac_space)
    sequence_length = None

    ob = U.get_placeholder(name="ob", dtype=tf.float32,
                           shape=[sequence_length] + list(ob_space.shape))
    # with tf.variable_scope("obfilter"):
    #     self.ob_rms = RunningMeanStd(shape=ob_space.shape)
    # obz = tf.clip_by_value((ob - self.ob_rms.mean) / self.ob_rms.std, -5.0, 5.0)
    obz = ob

    valueFunction = Sequential()
    valueFunction.add(InputLayer(input_tensor=obz))
    valueFunction.add(Dense(64, activation='tanh'))
    valueFunction.add(Dense(64, activation='tanh'))
    self.vpred = self.dense(x=valueFunction.output, size=1, name="vffinal",
                            weight_init=U.normc_initializer(1.0), bias=True)[:, 0]

    model = Sequential()
    model.add(InputLayer(input_tensor=obz))
    model.add(Dense(64, activation='tanh'))
    model.add(Dense(64, activation='tanh'))
    model.add(Dense(23))
    model.load_weights("neural_kick")

    if gaussian_fixed_var and isinstance(ac_space, gym.spaces.Box):
        mean = model.output
        logstd = tf.get_variable(name="logstd",
                                 shape=[1, pdtype.param_shape()[0] // 2],
                                 initializer=tf.constant_initializer(exploration_rate))
        pdparam = tf.concat([mean, mean * 0.0 + logstd], axis=1)
    else:
        # Keyword arguments are required here: passing "polfinal" positionally
        # would bind it to tf.layers.dense's `activation` parameter.
        pdparam = tf.layers.dense(model.output, pdtype.param_shape()[0],
                                  name="polfinal",
                                  kernel_initializer=U.normc_initializer(0.01))

    # NOTE: `mean` is only defined in the gaussian_fixed_var branch above; this
    # slice would raise NameError on the other path.
    my_var = tf.strided_slice(mean, [0], [1], [1], shrink_axis_mask=1)
    my_var_out = tf.identity(my_var, name='output_node')

    self.pd = pdtype.pdfromflat(pdparam)
    self.state_in = []
    self.state_out = []

    stochastic = tf.placeholder(dtype=tf.bool, shape=())
    ac = U.switch(stochastic, self.pd.sample(), self.pd.mode())
    self._act = U.function([stochastic, ob], [ac, self.vpred])
def __init__(self, ob_dim, ac_dim):
    # Here we'll construct a bunch of expressions, which will be used in two places:
    # (1) When sampling actions
    # (2) When computing loss functions, for the policy update
    # Variables specific to (1) have the word "sampled" in them,
    # whereas variables specific to (2) have the word "old" in them
    ob_no = tf.placeholder(tf.float32, shape=[None, ob_dim * 2], name="ob")  # batch of observations
    oldac_na = tf.placeholder(tf.float32, shape=[None, ac_dim], name="ac")  # batch of previous actions
    oldac_dist = tf.placeholder(tf.float32, shape=[None, ac_dim * 2], name="oldac_dist")  # batch of previous action distributions
    adv_n = tf.placeholder(tf.float32, shape=[None], name="adv")  # advantage function estimate
    oldlogprob_n = tf.placeholder(tf.float32, shape=[None], name='oldlogprob')  # log probability of previous actions
    wd_dict = {}
    h1 = tf.nn.tanh(dense(ob_no, 64, "h1", weight_init=U.normc_initializer(1.0), bias_init=0.0, weight_loss_dict=wd_dict))
    h2 = tf.nn.tanh(dense(h1, 64, "h2", weight_init=U.normc_initializer(1.0), bias_init=0.0, weight_loss_dict=wd_dict))
    mean_na = dense(h2, ac_dim, "mean", weight_init=U.normc_initializer(0.1), bias_init=0.0, weight_loss_dict=wd_dict)  # Mean control output
    self.wd_dict = wd_dict
    self.logstd_1a = logstd_1a = tf.get_variable("logstd", [ac_dim], tf.float32, tf.zeros_initializer())  # Variance on outputs
    logstd_1a = tf.expand_dims(logstd_1a, 0)
    std_1a = tf.exp(logstd_1a)
    std_na = tf.tile(std_1a, [tf.shape(mean_na)[0], 1])
    ac_dist = tf.concat([tf.reshape(mean_na, [-1, ac_dim]), tf.reshape(std_na, [-1, ac_dim])], 1)
    sampled_ac_na = tf.random_normal(tf.shape(ac_dist[:, ac_dim:])) * ac_dist[:, ac_dim:] + ac_dist[:, :ac_dim]  # This is the sampled action we'll perform.
    logprobsampled_n = - U.sum(tf.log(ac_dist[:, ac_dim:]), axis=1) - 0.5 * tf.log(2.0 * np.pi) * ac_dim - 0.5 * U.sum(tf.square(ac_dist[:, :ac_dim] - sampled_ac_na) / (tf.square(ac_dist[:, ac_dim:])), axis=1)  # Logprob of sampled action
    logprob_n = - U.sum(tf.log(ac_dist[:, ac_dim:]), axis=1) - 0.5 * tf.log(2.0 * np.pi) * ac_dim - 0.5 * U.sum(tf.square(ac_dist[:, :ac_dim] - oldac_na) / (tf.square(ac_dist[:, ac_dim:])), axis=1)  # Logprob of previous actions under CURRENT policy (whereas oldlogprob_n is under OLD policy)
    kl = U.mean(kl_div(oldac_dist, ac_dist, ac_dim))
    # kl = .5 * U.mean(tf.square(logprob_n - oldlogprob_n))  # Approximation of KL divergence between old policy used to generate actions, and new policy used to compute logprob_n
    surr = - U.mean(adv_n * logprob_n)  # Loss function that we'll differentiate to get the policy gradient
    surr_sampled = - U.mean(logprob_n)  # Sampled loss of the policy
    self._act = U.function([ob_no], [sampled_ac_na, ac_dist, logprobsampled_n])  # Generate a new action and its logprob
    # self.compute_kl = U.function([ob_no, oldac_na, oldlogprob_n], kl)  # Compute (approximate) KL divergence between old policy and new policy
    self.compute_kl = U.function([ob_no, oldac_dist], kl)
    self.update_info = ((ob_no, oldac_na, adv_n), surr, surr_sampled)  # Input and output variables needed for computing loss
    U.initialize()  # Initialize uninitialized TF variables
def img_encoder(self, x, kind):
    if kind == 'small':  # from A3C paper
        x = max_pool(tf.nn.relu(U.conv2d(x, 16, "l1", [8, 8], [1, 1], pad="VALID")), 4)
        x = max_pool(tf.nn.relu(U.conv2d(x, 32, "l2", [4, 4], [1, 1], pad="VALID")), 2)
        x = U.flattenallbut0(x)
        x = tf.nn.relu(tf.layers.dense(x, 256, name='lin',
                                       kernel_initializer=U.normc_initializer(1.0)))
    elif kind == 'large':  # Nature DQN
        x = max_pool(tf.nn.relu(U.conv2d(x, 32, "l1", [8, 8], [1, 1], pad="VALID")), 4)
        x = max_pool(tf.nn.relu(U.conv2d(x, 64, "l2", [4, 4], [1, 1], pad="VALID")), 2)
        x = tf.nn.relu(U.conv2d(x, 64, "l3", [3, 3], [1, 1], pad="VALID"))
        x = U.flattenallbut0(x)
        x = tf.nn.relu(tf.layers.dense(x, 512, name='lin',
                                       kernel_initializer=U.normc_initializer(1.0)))
    else:
        raise NotImplementedError
    return x
def _init(self, ob_space, ac_space, hid_size, num_hid_layers, use_actions):
    assert isinstance(ob_space, gym.spaces.Box)
    self.use_actions = use_actions
    sequence_length = None

    if use_actions:
        inp_shape = (ob_space.shape[0] + ac_space.shape[0],)
    else:
        inp_shape = ob_space.shape

    rew_input = U.get_placeholder(name="rew_input", dtype=tf.float32,
                                  shape=[sequence_length] + list(inp_shape))

    with tf.variable_scope("inputfilter"):
        self.inp_rms = RunningMeanStd(shape=inp_shape)

    with tf.variable_scope('rew'):
        input_clipped = tf.clip_by_value(
            (rew_input - self.inp_rms.mean) / self.inp_rms.std, -5.0, 5.0)
        last_out = input_clipped
        for i in range(num_hid_layers):
            last_out = tf.nn.tanh(
                tf.layers.dense(last_out, hid_size, name="fc%i" % (i + 1),
                                kernel_initializer=U.normc_initializer(1.0)))
        self.reward = tf.layers.dense(last_out, 1, name='final',
                                      kernel_initializer=U.normc_initializer(1.0))[:, 0]
    self._rew = U.function([rew_input], [self.reward])
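# --- Hedged usage sketch (assumed names): querying the learned reward for one
# (ob, ac) pair. _rew was compiled with a list of outputs, so it returns a
# one-element list.
inp = np.concatenate([ob, ac], axis=-1) if reward_net.use_actions else ob
(rew,) = reward_net._rew(inp[None])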
def _init(self, ob_space, ac_space, hid_size, num_hid_layers, gaussian_fixed_var=True):
    assert isinstance(ob_space, gym.spaces.Box)

    self.pdtype = pdtype = make_pdtype(ac_space)
    sequence_length = None

    ob = U.get_placeholder(name="ob", dtype=tf.float32,
                           shape=[sequence_length] + list(ob_space.shape))

    with tf.variable_scope("obfilter"):
        self.ob_rms = RunningMeanStd(shape=ob_space.shape)

    obz = tf.clip_by_value((ob - self.ob_rms.mean) / self.ob_rms.std, -5.0, 5.0)

    last_out = obz
    for i in range(num_hid_layers):
        last_out = tf.nn.tanh(U.dense(last_out, hid_size, "vffc%i" % (i + 1),
                                      weight_init=U.normc_initializer(1.0)))
    self.vpred = U.dense(last_out, 1, "vffinal",
                         weight_init=U.normc_initializer(1.0))[:, 0]

    last_out = obz
    for i in range(num_hid_layers):
        last_out = tf.nn.tanh(U.dense(last_out, hid_size, "polfc%i" % (i + 1),
                                      weight_init=U.normc_initializer(1.0)))
    if gaussian_fixed_var and isinstance(ac_space, gym.spaces.Box):
        mean = U.dense(last_out, pdtype.param_shape()[0] // 2, "polfinal",
                       U.normc_initializer(0.01))
        logstd = tf.get_variable(name="logstd",
                                 shape=[1, pdtype.param_shape()[0] // 2],
                                 initializer=tf.zeros_initializer())
        pdparam = U.concatenate([mean, mean * 0.0 + logstd], axis=1)
    else:
        pdparam = U.dense(last_out, pdtype.param_shape()[0], "polfinal",
                          U.normc_initializer(0.01))

    self.pd = pdtype.pdfromflat(pdparam)

    self.state_in = []
    self.state_out = []

    stochastic = tf.placeholder(dtype=tf.bool, shape=())
    ac = U.switch(stochastic, self.pd.sample(), self.pd.mode())
    self._act = U.function([stochastic, ob], [ac, self.vpred])
def build_graph(self, obs_ph, acs_ph, reuse=False):
    """
    obs_ph: tf tensor, shape [None, 84, 84, 4]
    acs_ph: tf tensor of integer action indices, shape [None];
        converted to a one-hot encoding below
    """
    with tf.variable_scope(self.scope):
        if reuse:
            tf.get_variable_scope().reuse_variables()

        one_hot_ac = tf.one_hot(acs_ph, self.num_actions, dtype=tf.float32)
        # Broadcast the one-hot action over the spatial dims and stack it onto
        # the frame channels: [None, 84, 84, 4 + ac_dim].
        x = tf.concat([obs_ph / 255.0,
                       tf.tile(one_hot_ac[:, None, None, :], [1, 84, 84, 1])],
                      axis=3)
        x = tf.nn.relu(U.conv2d(x, 32, "l1", [8, 8], [4, 4], pad="VALID"))
        x = tf.nn.relu(U.conv2d(x, 64, "l2", [4, 4], [2, 2], pad="VALID"))
        x = tf.nn.relu(U.conv2d(x, 64, "l3", [3, 3], [1, 1], pad="VALID"))
        x = U.flattenallbut0(x)
        x = tf.nn.tanh(tf.layers.dense(x, 512, name='lin1',
                                       kernel_initializer=U.normc_initializer(1.0)))
        logits = tf.layers.dense(x, 1, name='lin2',
                                 kernel_initializer=U.normc_initializer(1.0))
    return logits
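# --- Hedged follow-up (an assumption, not necessarily this codebase's choice):
# a common GAIL-style transform of the discriminator logits above into an
# imitation reward, with a small epsilon to guard log(0).
logits = discriminator.build_graph(obs_ph, acs_ph)  # `discriminator` is an assumed instance
reward_op = -tf.log(1.0 - tf.nn.sigmoid(logits) + 1e-8)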
def _init(self, ob_space, ac_space, hid_size, num_hid_layers, rnn_hid_units,
          gaussian_fixed_var=True):
    # assert isinstance(ob_space, gym.spaces.Box)
    self.pdtype = pdtype = make_pdtype(ac_space)
    sequence_length = None

    ob = U.get_placeholder(name="ob", dtype=tf.float32,
                           shape=[sequence_length] + list(ob_space.shape))

    # Apply an RNN to reduce the history
    with tf.variable_scope("vf"):
        last_out = self.rnn(ob, ob_space.shape[0], rnn_hid_units)
        for i in range(num_hid_layers):
            last_out = U.dense(last_out, hid_size, "vf_dense%i" % i,
                               weight_init=U.normc_initializer(1.0))
        self.vpred = U.dense(last_out, 1, "vffinal",
                             weight_init=U.normc_initializer(1.0))[:, 0]

    # Apply an RNN to reduce the history
    with tf.variable_scope("pf"):
        last_out = self.rnn(ob, ob_space.shape[0], rnn_hid_units)
        for i in range(num_hid_layers):
            last_out = U.dense(last_out, hid_size, "pf_dense%i" % i,
                               weight_init=U.normc_initializer(1.0))
        assert gaussian_fixed_var and isinstance(ac_space, gym.spaces.Box)
        mean = U.dense(last_out, pdtype.param_shape()[0] // 2, "polfinal",
                       U.normc_initializer(0.01))
        logstd = tf.get_variable(name="logstd",
                                 shape=[1, pdtype.param_shape()[0] // 2],
                                 initializer=tf.zeros_initializer())
        pdparam = U.concatenate([mean, mean * 0.0 + logstd], axis=1)

    self.pd = pdtype.pdfromflat(pdparam)
    self.state_in = []
    self.state_out = []

    stochastic = tf.placeholder(dtype=tf.bool, shape=())
    ac = U.switch(stochastic, self.pd.sample(), self.pd.mode())
    self._act = U.function([stochastic, ob], [ac, self.vpred])
def _init(self, ob_space, ac_space, hid_size, num_hid_layers, tau, gaussian_fixed_var=True):
    assert isinstance(ob_space, gym.spaces.Box)
    print('use zpmpl_Adv')

    self.ac_space = ac_space
    self.hid_size = hid_size
    self.num_hid_layers = num_hid_layers
    self.gaussian_fixed_var = gaussian_fixed_var
    self.pdtype = pdtype = make_pdtype(ac_space)
    sequence_length = None

    self.ob = U.get_placeholder(name="ob_adv", dtype=tf.float32,
                                shape=[sequence_length] + list(ob_space.shape))
    self.ob_ = U.get_placeholder(name="adv_ob_", dtype=tf.float32,
                                 shape=[sequence_length] + list(ob_space.shape))

    with tf.variable_scope("obfilter_adv"):
        self.ob_rms = RunningMeanStd(shape=ob_space.shape)

    with tf.variable_scope('adv_vf'):
        self.obz = tf.clip_by_value(
            (self.ob - self.ob_rms.mean) / self.ob_rms.std, -5.0, 5.0)
        last_out = self.obz
        for i in range(self.num_hid_layers):
            last_out = tf.nn.tanh(
                tf.layers.dense(last_out, self.hid_size, name="adv_vffc%i" % (i + 1),
                                kernel_initializer=U.normc_initializer(1.0)))
        self.vpred = tf.layers.dense(last_out, 1, name="adv_vffinal",
                                     kernel_initializer=U.normc_initializer(1.0))[:, 0]

    self.pdparam = self.build_action(self.ob)
    self.pdparam_ = self.build_action(self.ob_, reuse=True)
    self.pd = pdtype.pdfromflat(self.pdparam)

    self.state_in = []
    self.state_out = []

    stochastic = tf.placeholder(dtype=tf.bool, shape=())
    ac = U.switch(stochastic, self.pd.sample(), self.pd.mode())
    self.ac = self.pd.sample()
    self.ac_, _ = self.sample_()
    self._act = U.function([stochastic, self.ob], [ac, self.vpred])
def __init__(self, ob_dim, ac_dim, ac_space, bins):
    # Here we'll construct a bunch of expressions, which will be used in two places:
    # (1) When sampling actions
    # (2) When computing loss functions, for the policy update
    # Variables specific to (1) have the word "sampled" in them,
    # whereas variables specific to (2) have the word "old" in them
    ob_no = tf.placeholder(tf.float32, shape=[None, ob_dim * 2], name="ob")  # batch of observations
    oldac_na = tf.placeholder(tf.int32, shape=[None, ac_dim], name="ac")  # batch of previous actions
    oldac_logits = tf.placeholder(tf.float32, shape=[None, ac_dim * bins], name="oldac_logit")  # batch of previous action distributions
    adv_n = tf.placeholder(tf.float32, shape=[None], name="adv")  # advantage function estimate
    self.pdtype = make_pdtype(ac_space)
    wd_dict = {}

    # forward pass
    h1 = tf.nn.tanh(dense(ob_no, 64, "h1", weight_init=U.normc_initializer(1.0),
                          bias_init=0.0, weight_loss_dict=wd_dict))
    h2 = tf.nn.tanh(dense(h1, 64, "h2", weight_init=U.normc_initializer(1.0),
                          bias_init=0.0, weight_loss_dict=wd_dict))
    logits_na = dense(h2, self.pdtype.param_shape()[0], "logits",
                      weight_init=U.normc_initializer(0.1), bias_init=0.0,
                      weight_loss_dict=wd_dict)  # Mean control
    self.wd_dict = wd_dict
    self.pd = self.pdtype.pdfromflat(logits_na)  # multi-categorical distributions

    # sample action for control
    sampled_ac_na = self.pd.sample()

    # log prob for sampled actions
    logprobsampled_n = -self.pd.neglogp(sampled_ac_na)
    logprob_n = -self.pd.neglogp(oldac_na)

    # kl div
    old_pd = self.pdtype.pdfromflat(oldac_logits)
    kl = U.mean(old_pd.kl(self.pd))

    # surr loss
    surr = -U.mean(adv_n * logprob_n)
    surr_sampled = -U.mean(logprob_n)

    # expressions
    self._act = U.function([ob_no], [sampled_ac_na, logits_na, logprobsampled_n])
    self.compute_kl = U.function([ob_no, oldac_logits], kl)
    self.update_info = ((ob_no, oldac_na, adv_n), surr, surr_sampled)
    U.initialize()
def resnet(inputs, hid_size, name):
    x = U.dense(inputs, hid_size, "%s_dense1" % name, weight_init=U.normc_initializer(1.0))
    # x = tf.contrib.layers.batch_norm(x)
    x = tf.nn.relu(x)
    x = U.dense(x, hid_size, "%s_dense2" % name, weight_init=U.normc_initializer(1.0))
    # x = tf.contrib.layers.batch_norm(x)
    x = tf.nn.relu(x + inputs)  # residual connection
    return x
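# --- Hedged usage note for resnet() above: the skip connection `x + inputs`
# only typechecks when `inputs` already has hid_size units, so project once
# before stacking blocks. Shapes below are illustrative assumptions.
ob = tf.placeholder(tf.float32, [None, 11])
h = U.dense(ob, 64, "proj", weight_init=U.normc_initializer(1.0))
for k in range(2):
    h = resnet(h, 64, "block%d" % k)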
def __init__(self, sess, ob_dim, ac_dim, vf_lr=0.001, cv_lr=0.001, reuse=False):
    # Here we'll construct a bunch of expressions, which will be used in two places:
    # (1) When sampling actions
    # (2) When computing loss functions, for the policy update
    # Variables specific to (1) have the word "sampled" in them,
    # whereas variables specific to (2) have the word "old" in them
    self.relaxed = False
    self.X = tf.placeholder(tf.float32, shape=[None, ob_dim * 2 + ac_dim * 2 + 2])  # batch of value-function inputs
    self.ob_no = tf.placeholder(tf.float32, shape=[None, ob_dim * 2], name="ob")  # batch of observations
    self.oldac_na = tf.placeholder(tf.float32, shape=[None, ac_dim], name="ac")  # batch of previous actions
    oldac_dist = tf.placeholder(tf.float32, shape=[None, ac_dim * 2], name="oldac_dist")  # batch of previous action distributions

    with tf.variable_scope("model", reuse=reuse):
        h1 = tf.nn.tanh(dense(self.ob_no, 64, "pi_h1", weight_init=U.normc_initializer(1.0), bias_init=0.0))
        h2 = tf.nn.tanh(dense(h1, 64, "pi_h2", weight_init=U.normc_initializer(1.0), bias_init=0.0))
        mean_na = dense(h2, ac_dim, "pi", weight_init=U.normc_initializer(0.1), bias_init=0.0)  # Mean control output
        self.logstd_1a = logstd_1a = tf.get_variable("logstd", [ac_dim], tf.float32, tf.zeros_initializer())  # Variance on outputs
        logstd_1a = tf.expand_dims(logstd_1a, 0)
        self.std_1a = tf.exp(logstd_1a)
        self.std_na = tf.tile(self.std_1a, [tf.shape(mean_na)[0], 1])
        ac_dist = tf.concat([tf.reshape(mean_na, [-1, ac_dim]), tf.reshape(self.std_na, [-1, ac_dim])], 1)
        sampled_ac_na = tf.random_normal(tf.shape(ac_dist[:, ac_dim:])) * ac_dist[:, ac_dim:] + ac_dist[:, :ac_dim]  # This is the sampled action we'll perform.
        logprobsampled_n = - U.sum(tf.log(ac_dist[:, ac_dim:]), axis=1) - 0.5 * tf.log(2.0 * np.pi) * ac_dim - 0.5 * U.sum(tf.square(ac_dist[:, :ac_dim] - sampled_ac_na) / (tf.square(ac_dist[:, ac_dim:])), axis=1)  # Logprob of sampled action
        self.logprob_n = - U.sum(tf.log(ac_dist[:, ac_dim:]), axis=1) - 0.5 * tf.log(2.0 * np.pi) * ac_dim - 0.5 * U.sum(tf.square(ac_dist[:, :ac_dim] - self.oldac_na) / (tf.square(ac_dist[:, ac_dim:])), axis=1)  # Logprob of previous actions under CURRENT policy (whereas oldlogprob_n is under OLD policy)
        kl = U.mean(kl_div(oldac_dist, ac_dist, ac_dim))

        vh1 = tf.nn.elu(dense(self.X, 64, "vf_h1", weight_init=U.normc_initializer(1.0), bias_init=0))
        vh2 = tf.nn.elu(dense(vh1, 64, "vf_h2", weight_init=U.normc_initializer(1.0), bias_init=0))
        vpred_n = dense(vh2, 1, "vf", weight_init=None, bias_init=0)
        v0 = vpred_n[:, 0]
        self.vf_optim = tf.train.AdamOptimizer(vf_lr)

    def act(ob):
        # Generate a new action and its logprob
        ac, dist, logp = sess.run([sampled_ac_na, ac_dist, logprobsampled_n],
                                  {self.ob_no: ob[None]})
        return ac[0], dist[0], logp[0]

    def value(obs, x):
        return sess.run(v0, {self.X: x, self.ob_no: obs})

    def preproc(path):
        l = pathlength(path)
        al = np.arange(l).reshape(-1, 1) / 10.0
        act = path["action_dist"].astype('float32')
        X = np.concatenate([path['observation'], act, al, np.ones((l, 1))], axis=1)
        return X

    def predict(obs, path):
        return value(obs, preproc(path))

    def compute_kl(ob, dist):
        return sess.run(kl, {self.ob_no: ob, oldac_dist: dist})

    self.mean = mean_na
    self.vf = v0
    self.act = act
    self.value = value
    self.preproc = preproc
    self.predict = predict
    self.compute_kl = compute_kl
    self.a0 = sampled_ac_na
def build_forward(self, state, reuse):
    # build noise samples
    batch_size = [state.get_shape().as_list()[0], self.input_dim]
    noise_dist = tfd.Normal(loc=0., scale=1.)
    noise_samples = noise_dist.sample(batch_size)  # size of [batchsize, action dim]

    # build forward
    last_out = state
    self.meandict = meandict = []
    self.logstddict = logstddict = []
    with tf.variable_scope('forward', reuse=reuse):
        for i in range(self.num_hid_layers):
            last_out = tf.nn.tanh(
                U.dense(last_out, self.hid_size, "polfc%i" % (i + 1),
                        weight_init=U.normc_initializer(1.0)))
        for k in range(self.K):
            mean = U.dense(last_out, self.input_dim, "polfinal_{}".format(k),
                           U.normc_initializer(0.01))
            logstd = tf.get_variable(name="logstd_{}".format(k),
                                     shape=[1, self.input_dim],
                                     initializer=tf.zeros_initializer())
            meandict.append(mean)
            logstddict.append(logstd)
        meandicttf = tf.concat(meandict, axis=1)  # size of [batchsize, action dim * K]
        logstddicttf = tf.concat(logstddict, axis=1)

        # generate masks: pick one of the K mixture components uniformly
        logits = [0.0] * self.K
        num_samples = self.state.shape.as_list()[0]
        categorical_mask = tf.multinomial([logits], num_samples)
        # print('categoricalmask', categorical_mask)
        onehot_mask = tf.squeeze(tf.one_hot(categorical_mask, self.K), 0)
        # print('onehotmask', onehot_mask)
        onehot_mask_tiled = tf.squeeze(
            tf.reshape(
                tf.tile(tf.expand_dims(onehot_mask, axis=2), [1, 1, self.input_dim]),
                [-1, self.input_dim * self.K, 1]),
            axis=2)

        # select
        mean_tiled = tf.multiply(onehot_mask_tiled, meandicttf)  # size of [batchsize, action dim * K]
        logstd_tiled = tf.multiply(onehot_mask_tiled, logstddicttf)

        # sample action mean and logstd
        mean = tf.reshape(mean_tiled, [-1, self.K, self.input_dim])  # size of [batchsize, K, action dim]
        logstd = tf.reshape(logstd_tiled, [-1, self.K, self.input_dim])
        mean_final = tf.reduce_sum(mean, axis=1, keepdims=True)  # size of [batchsize, action dim]
        logstd_final = tf.reduce_sum(logstd, axis=1, keepdims=True)

        # sample action
        action = tf.exp(logstd_final) * noise_samples + mean_final
        self.y_sample = action
def build_network(self, sess, scope, ob):
    with tf.variable_scope(scope + "/obfilter"):
        ob_rms = RunningMeanStd(shape=self.ob_space.shape)

    with tf.variable_scope(scope + '/vf'):
        obz = tf.clip_by_value((ob - ob_rms.mean) / ob_rms.std, -5.0, 5.0)
        last_out = obz
        for i in range(self.num_hid_layers):
            last_out = tf.nn.tanh(
                tf.layers.dense(last_out, self.hid_size, name="fc%i" % (i + 1),
                                kernel_initializer=U.normc_initializer(1.0)))
        vpred = tf.layers.dense(last_out, 1, name='final',
                                kernel_initializer=U.normc_initializer(1.0))[:, 0]

    with tf.variable_scope(scope + '/pol'):
        last_out = obz
        ############## tf layers version #############
        for i in range(self.num_hid_layers):
            last_out = tf.nn.tanh(
                tf.layers.dense(last_out, self.hid_size, name='fc%i' % (i + 1),
                                kernel_initializer=U.normc_initializer(1.0)))
        mean = tf.layers.dense(last_out, self.ac_dim, name='final',
                               kernel_initializer=U.normc_initializer(0.01))
        # ############## tf learn version #############
        # for i in range(self.num_hid_layers):
        #     last_out = tflearn.fully_connected(last_out, self.hid_size, name='fc%i' % (i + 1), activation='tanh')
        # mean = tflearn.fully_connected(last_out, self.ac_dim, name='final', activation='tanh')
        logstd = tf.get_variable(name="logstd",
                                 shape=[1, self.pdtype.param_shape()[0] // 2],
                                 initializer=tf.zeros_initializer())
        pdparam = tf.concat([mean, mean * 0.0 + logstd], axis=1)
        pd = self.pdtype.pdfromflat(pdparam)
        sample_ac = pd.sample()
        ac_mean = pd.mode()

    return ob_rms, vpred, pd, sample_ac, ac_mean
def _init(self, ob_space, ac_space, hid_size, num_hid_layers, num_units=3, num_layers=4):
    assert isinstance(ob_space, gym.spaces.Box)
    nbatch_train = 1024
    nbatch_vf_train = 64
    nbatch_fvp_train = 205  # sub-sampled size

    self.ob_train = ob_train = U.get_placeholder(
        name="ob_train", dtype=tf.float32, shape=[nbatch_train] + list(ob_space.shape))
    self.action_train = action_train = U.get_placeholder(
        name='ac_train', dtype=tf.float32, shape=[nbatch_train] + list(ac_space.shape))
    ob_act = U.get_placeholder(
        name="ob_act", dtype=tf.float32, shape=[1] + list(ob_space.shape))
    action_act = U.get_placeholder(
        name='ac_act', dtype=tf.float32, shape=[1] + list(ac_space.shape))
    self.ob_vf_train = ob_vf_train = U.get_placeholder(
        name="ob_vf_train", dtype=tf.float32, shape=[nbatch_vf_train] + list(ob_space.shape))
    self.ob_fvp_train = ob_fvp_train = U.get_placeholder(
        name="ob_fvp_train", dtype=tf.float32, shape=[nbatch_fvp_train] + list(ob_space.shape))
    self.ac_fvp_train = action_fvp_train = U.get_placeholder(
        name="ac_fvp_act", dtype=tf.float32, shape=[nbatch_fvp_train] + list(ac_space.shape))

    with tf.variable_scope("obfilter"):
        self.ob_rms = RunningMeanStd(shape=ob_space.shape)

    obz_train = tf.clip_by_value((ob_train - self.ob_rms.mean) / self.ob_rms.std, -5.0, 5.0)
    obz_act = tf.clip_by_value((ob_act - self.ob_rms.mean) / self.ob_rms.std, -5.0, 5.0)
    obz_vf_train = tf.clip_by_value((ob_vf_train - self.ob_rms.mean) / self.ob_rms.std, -5.0, 5.0)
    obz_fvp_train = tf.clip_by_value((ob_fvp_train - self.ob_rms.mean) / self.ob_rms.std, -5.0, 5.0)

    # value function
    last_out = obz_vf_train
    with tf.variable_scope('value', reuse=False):
        for i in range(num_hid_layers):
            last_out = tf.nn.tanh(U.dense(last_out, hid_size, "vffc%i" % (i + 1),
                                          weight_init=U.normc_initializer(1.0)))
        self.vpred_train = U.dense(last_out, 1, "vffinal",
                                   weight_init=U.normc_initializer(1.0))[:, 0]
    last_out = obz_act
    with tf.variable_scope('value', reuse=True):
        for i in range(num_hid_layers):
            last_out = tf.nn.tanh(U.dense(last_out, hid_size, "vffc%i" % (i + 1),
                                          weight_init=U.normc_initializer(1.0)))
        self.vpred_act = U.dense(last_out, 1, "vffinal",
                                 weight_init=U.normc_initializer(1.0))[:, 0]

    # policy
    policy_train = NormalizingFlowStateModel(obz_train, action_train, name='policy',
                                             reuse=False, num_units=num_units, num_layers=num_layers)
    policy_act = NormalizingFlowStateModel(obz_act, action_act, name='policy',
                                           reuse=True, num_units=num_units, num_layers=num_layers)
    policy_fvp_train = NormalizingFlowStateModel(obz_fvp_train, action_fvp_train, name='policy',
                                                 reuse=True, num_units=num_units, num_layers=num_layers)

    self.pi_act = policy_act.y_sample  # act for forward sampling
    self.pi_train = policy_fvp_train.y_sample  # for fvp
    self.entropy_train = policy_train.entropy
    self.log_prob_act = policy_act.log_prob
    self.action_act = action_act
    self.log_prob_train = policy_train.log_prob  # logprob
    self.log_prob_fvp_train = policy_fvp_train.log_prob

    self.state_in = []
    self.state_out = []

    # stochastic = tf.placeholder(dtype=tf.bool, shape=())
    # ac = U.switch(stochastic, self.pd.sample(), self.pd.mode())
    # self._act = U.function([stochastic, ob], [ac, self.vpred])
    self._act = U.function([ob_act], [self.pi_act, self.vpred_act])
    self.ob_act = ob_act
def lstm_graph(ob_combined, input_state_combined, env):
    # parse action distribution mean, logstd
    # get action space type
    pdtype = make_pdtype(env.action_space)

    # new cell
    cell = tf.contrib.rnn.LSTMCell(num_units=NUM_UNITS, name="lol")

    # Initialize state with zero of batch size 1 and type float32
    # c_state, m_state = tf.split(input_state_combined, [1, 1], 0)
    c_state, m_state = input_state_combined[0, :, :], input_state_combined[1, :, :]
    state = tf.tuple([c_state, m_state])

    s_mean_list, s_std_list, s_logstd_list = [], [], []
    for i in range(STEPS_UNROLLED):
        if i > 0:
            tf.get_variable_scope().reuse_variables()
        # normalize observation vector with rms
        rms = RunningMeanStd(shape=env.observation_space.shape)
        # only first step; the rest will need all (all batch, all observation space dim)
        obz = tf.clip_by_value((ob_combined[i, :, :] - rms.mean) / rms.std, -5.0, 5.0)
        output, state = cell(obz, state)
        output = tf.nn.tanh(
            tf.layers.dense(output, 64, name='last',
                            kernel_initializer=U.normc_initializer(1.0)))
        # feed the output of lstm to a final FC layer
        # this 'flat' vector will be split into the mean and std of a pd
        pdparam = tf.layers.dense(output, pdtype.param_shape()[0], name='final',
                                  kernel_initializer=U.normc_initializer(0.01))
        pd = pdtype.pdfromflat(pdparam)
        s_mean_list.append(pd.mean)
        s_std_list.append(pd.std)
        s_logstd_list.append(pd.logstd)

    # stack the outputs at each cell together so that we can conveniently compute loss etc.
    s_mean_combined = tf.stack(s_mean_list)
    s_std_combined = tf.stack(s_std_list)
    s_logstd_combined = tf.stack(s_logstd_list)
    final_state_combined = tf.stack(state)
    return s_mean_combined, s_std_combined, s_logstd_combined, final_state_combined
def _init(self, ob_space, ac_space, hid_size, num_hid_layers, kind):
    print(type(ob_space))
    assert isinstance(ob_space, gym.spaces.box.Box)

    self.pdtype = pdtype = make_pdtype(ac_space)
    sequence_length = None

    ob = U.get_placeholder(name="ob", dtype=tf.float32,
                           shape=[sequence_length] + list(ob_space.shape))
    self.ob = [ob]

    # process ob
    x = ob / 255.0
    ob_last = self.img_encoder(x, kind)

    with tf.variable_scope("vf"):
        last_out = ob_last
        for i in range(num_hid_layers):
            last_out = tf.nn.relu(
                tf.layers.dense(last_out, hid_size, name="fc%i" % (i + 1),
                                kernel_initializer=U.normc_initializer(1.0)))
        self.vpred = tf.layers.dense(last_out, 1, name='final',
                                     kernel_initializer=U.normc_initializer(1.0))[:, 0]

    with tf.variable_scope("pol"):
        last_out = ob_last
        for i in range(num_hid_layers):
            last_out = tf.nn.tanh(
                tf.layers.dense(last_out, hid_size, name='fc%i' % (i + 1),
                                kernel_initializer=U.normc_initializer(1.0)))
        logits = tf.layers.dense(last_out, pdtype.param_shape()[0], name='logits',
                                 kernel_initializer=U.normc_initializer(0.01))
        self.pd = pdtype.pdfromflat(logits)

    self.state_in = []
    self.state_out = []

    stochastic = tf.placeholder(dtype=tf.bool, shape=())
    ac = U.switch(stochastic, self.pd.sample(), self.pd.mode())  # XXX
    self._act = U.function([stochastic, ob], [ac, self.vpred])
def __init__(self, name, observation_shape, action_shape, hid_size, num_hid_layers,
             stochastic=True):
    with tf.variable_scope(name):
        self.stochastic = stochastic
        self.hid_size, self.num_hid_layers = hid_size, num_hid_layers
        self.action_shape, self.observation_shape = action_shape, observation_shape
        self.scope = tf.get_variable_scope().name
        self.pdtype = DiagGaussianPdType(action_shape[0])

        observations_ph = U.get_placeholder(name='ob', dtype=tf.float32,
                                            shape=[None] + list(observation_shape))
        stochastic_ph = tf.placeholder(dtype=tf.bool, shape=())

        with tf.variable_scope('obfilter'):
            self.ob_rms = RunningMeanStd(shape=observation_shape)
        with tf.variable_scope('pol'):
            last_out = tf.clip_by_value(
                (observations_ph - self.ob_rms.mean) / self.ob_rms.std, -5.0, 5.0)
            for i in range(num_hid_layers):
                last_out = tf.nn.tanh(
                    tf.layers.dense(last_out, hid_size, name='fc%i' % (i + 1),
                                    kernel_initializer=U.normc_initializer(1.0)))
            mean = tf.layers.dense(last_out, self.pdtype.param_shape()[0] // 2,
                                   name='final',
                                   kernel_initializer=U.normc_initializer(0.01))
            logstd = tf.get_variable(name='logstd',
                                     shape=[1, self.pdtype.param_shape()[0] // 2],
                                     initializer=tf.zeros_initializer())
            pdparam = tf.concat([mean, mean * 0.0 + logstd], axis=1)

        self.pd = self.pdtype.pdfromflat(pdparam)
        action_op = U.switch(stochastic_ph, self.pd.sample(), self.pd.mode())
        self._act = U.function([stochastic_ph, observations_ph], action_op)
def _init(self, ob_space, ac_space, hid_size, num_hid_layers, activation='tanh',
          gaussian_fixed_var=True, keep=1.0):
    assert isinstance(ob_space, gym.spaces.Box)

    self.pdtype = pdtype = make_pdtype(ac_space)
    sequence_length = None

    ob_shape = OBSERVATION_DIM if PREPROCESS else ob_space.shape[0]
    ob = U.get_placeholder(name="ob", dtype=tf.float32,
                           shape=[sequence_length, ob_shape])

    if activation == 'tanh':
        activ = tf.nn.tanh
    elif activation == 'elu':
        activ = tf.nn.elu
    elif activation == 'lrelu':
        activ = lambda x: tf.maximum(x, 0.01 * x)
    else:
        raise NotImplementedError("Not available activation: " + activation)

    if PREPROCESS:
        last_out = ob
    else:
        with tf.variable_scope("obfilter"):
            self.ob_rms = RunningMeanStd(shape=ob_space.shape)
        obz = tf.clip_by_value((ob - self.ob_rms.mean) / self.ob_rms.std, -5.0, 5.0)
        last_out = obz

    for i in range(num_hid_layers):
        last_out = activ(U.dense(last_out, hid_size, "vffc%i" % (i + 1),
                                 weight_init=U.normc_initializer(1.0)))
        last_out = tf.nn.dropout(last_out, keep_prob=keep, name="vdrop%i" % (i + 1))
    self.vpred = U.dense(last_out, 1, "vffinal",
                         weight_init=U.normc_initializer(1.0))[:, 0]

    last_out = ob
    for i in range(num_hid_layers):
        last_out = activ(U.dense(last_out, hid_size, "polfc%i" % (i + 1),
                                 weight_init=U.normc_initializer(1.0)))
        last_out = tf.nn.dropout(last_out, keep_prob=keep, name="pdrop%i" % (i + 1))

    if gaussian_fixed_var and isinstance(ac_space, gym.spaces.Box):
        mean = U.dense(last_out, pdtype.param_shape()[0] // 2, "polfinal",
                       U.normc_initializer(0.01))
        logstd = tf.get_variable(name="logstd",
                                 shape=[1, pdtype.param_shape()[0] // 2],
                                 initializer=tf.zeros_initializer())
        pdparam = U.concatenate([mean, mean * 0.0 + logstd], axis=1)
    else:
        pdparam = U.dense(last_out, pdtype.param_shape()[0], "polfinal",
                          U.normc_initializer(0.01))

    self.pd = pdtype.pdfromflat(pdparam)
    self.state_in = []
    self.state_out = []

    stochastic = tf.placeholder(dtype=tf.bool, shape=())
    ac = U.switch(stochastic, self.pd.sample(), self.pd.mode())
    self._act = U.function([stochastic, ob], [ac, self.vpred])
def __init__(self, dim_state: int, dim_action: int, hidden_sizes: List[int],
             normalizer: GaussianNormalizer, init_std=1.):
    super().__init__()
    self.dim_state = dim_state
    self.dim_action = dim_action
    self.hidden_sizes = hidden_sizes
    self.init_std = init_std
    self.normalizer = normalizer

    with self.scope:
        self.op_states = tf.placeholder(tf.float32, shape=[None, dim_state], name='states')
        self.op_actions_ = tf.placeholder(tf.float32, shape=[None, dim_action], name='actions')

        layers = []
        # note that the placeholder has size 105.
        all_sizes = [dim_state, *self.hidden_sizes]
        for i, (in_features, out_features) in enumerate(zip(all_sizes[:-1], all_sizes[1:])):
            layers.append(nn.Linear(in_features, out_features,
                                    weight_initializer=normc_initializer(1)))
            layers.append(nn.Tanh())
        layers.append(nn.Linear(all_sizes[-1], dim_action,
                                weight_initializer=normc_initializer(0.01)))
        self.net = nn.Sequential(*layers)

        self.op_log_std = nn.Parameter(
            tf.constant(np.log(self.init_std), shape=[self.dim_action], dtype=tf.float32),
            name='log_std')

    self.distribution = self(self.op_states)
    self.op_actions = self.distribution.sample()
    self.op_actions_mean = self.distribution.mean()
    self.op_actions_std = self.distribution.stddev()
    self.op_nlls_ = -self.distribution.log_prob(self.op_actions_).reduce_sum(axis=1)
    self.register_callable('[states] => [actions]', self.fast)
def __init__(self, ob_dim, ac_dim):  # pylint: disable=W0613
    X = tf.placeholder(tf.float32, shape=[None, ob_dim * 2 + ac_dim * 2 + 2])  # batch of observations
    vtarg_n = tf.placeholder(tf.float32, shape=[None], name='vtarg')
    wd_dict = {}
    h1 = tf.nn.elu(dense(X, 64, "h1", weight_init=U.normc_initializer(1.0),
                         bias_init=0, weight_loss_dict=wd_dict))
    h2 = tf.nn.elu(dense(h1, 64, "h2", weight_init=U.normc_initializer(1.0),
                         bias_init=0, weight_loss_dict=wd_dict))
    vpred_n = dense(h2, 1, "hfinal", weight_init=U.normc_initializer(1.0),
                    bias_init=0, weight_loss_dict=wd_dict)[:, 0]
    sample_vpred_n = vpred_n + tf.random_normal(tf.shape(vpred_n))
    wd_loss = tf.get_collection("vf_losses", None)
    loss = U.mean(tf.square(vpred_n - vtarg_n)) + tf.add_n(wd_loss)
    loss_sampled = U.mean(tf.square(vpred_n - tf.stop_gradient(sample_vpred_n)))
    self._predict = U.function([X], vpred_n)
    optim = kfac.KfacOptimizer(learning_rate=0.001, cold_lr=0.001 * (1 - 0.9), momentum=0.9,
                               clip_kl=0.3, epsilon=0.1, stats_decay=0.95,
                               async=1, kfac_update=2, cold_iter=50,
                               weight_decay_dict=wd_dict, max_grad_norm=None)
    vf_var_list = []
    for var in tf.trainable_variables():
        if "vf" in var.name:
            vf_var_list.append(var)
    update_op, self.q_runner = optim.minimize(loss, loss_sampled, var_list=vf_var_list)
    self.do_update = U.function([X, vtarg_n], update_op)  # pylint: disable=E1101
    U.initialize()  # Initialize uninitialized TF variables