def __init__(self, ob_dim, ac_dim):
    # Here we'll construct a bunch of expressions, which will be used in two places:
    # (1) When sampling actions
    # (2) When computing loss functions, for the policy update
    # Variables specific to (1) have the word "sampled" in them,
    # whereas variables specific to (2) have the word "old" in them
    ob_no = tf.placeholder(tf.float32, shape=[None, ob_dim*2], name="ob") # batch of observations
    oldac_na = tf.placeholder(tf.float32, shape=[None, ac_dim], name="ac") # batch of previous actions
    oldac_dist = tf.placeholder(tf.float32, shape=[None, ac_dim*2], name="oldac_dist") # batch of previous action distributions
    adv_n = tf.placeholder(tf.float32, shape=[None], name="adv") # advantage function estimate
    wd_dict = {}
    h1 = tf.nn.tanh(dense(ob_no, 64, "h1", weight_init=U.normc_initializer(1.0), bias_init=0.0, weight_loss_dict=wd_dict))
    h2 = tf.nn.tanh(dense(h1, 64, "h2", weight_init=U.normc_initializer(1.0), bias_init=0.0, weight_loss_dict=wd_dict))
    mean_na = dense(h2, ac_dim, "mean", weight_init=U.normc_initializer(0.1), bias_init=0.0, weight_loss_dict=wd_dict) # Mean control output
    self.wd_dict = wd_dict
    self.logstd_1a = logstd_1a = tf.get_variable("logstd", [ac_dim], tf.float32, tf.zeros_initializer()) # Log standard deviation of the action distribution
    logstd_1a = tf.expand_dims(logstd_1a, 0)
    std_1a = tf.exp(logstd_1a)
    std_na = tf.tile(std_1a, [tf.shape(mean_na)[0], 1])
    ac_dist = tf.concat([tf.reshape(mean_na, [-1, ac_dim]), tf.reshape(std_na, [-1, ac_dim])], 1)
    sampled_ac_na = tf.random_normal(tf.shape(ac_dist[:,ac_dim:])) * ac_dist[:,ac_dim:] + ac_dist[:,:ac_dim] # This is the sampled action we'll perform.
    logprobsampled_n = - tf.reduce_sum(tf.log(ac_dist[:,ac_dim:]), axis=1) - 0.5 * tf.log(2.0*np.pi)*ac_dim - 0.5 * tf.reduce_sum(tf.square(ac_dist[:,:ac_dim] - sampled_ac_na) / (tf.square(ac_dist[:,ac_dim:])), axis=1) # Logprob of sampled action
    logprob_n = - tf.reduce_sum(tf.log(ac_dist[:,ac_dim:]), axis=1) - 0.5 * tf.log(2.0*np.pi)*ac_dim - 0.5 * tf.reduce_sum(tf.square(ac_dist[:,:ac_dim] - oldac_na) / (tf.square(ac_dist[:,ac_dim:])), axis=1) # Logprob of previous actions under CURRENT policy (whereas oldlogprob_n is under OLD policy)
    kl = tf.reduce_mean(kl_div(oldac_dist, ac_dist, ac_dim))
    #kl = .5 * tf.reduce_mean(tf.square(logprob_n - oldlogprob_n)) # Approximation of KL divergence between old policy used to generate actions, and new policy used to compute logprob_n
    surr = - tf.reduce_mean(adv_n * logprob_n) # Loss function that we'll differentiate to get the policy gradient
    surr_sampled = - tf.reduce_mean(logprob_n) # Sampled loss of the policy
    self._act = U.function([ob_no], [sampled_ac_na, ac_dist, logprobsampled_n]) # Generate a new action and its logprob
    #self.compute_kl = U.function([ob_no, oldac_na, oldlogprob_n], kl) # Compute (approximate) KL divergence between old policy and new policy
    self.compute_kl = U.function([ob_no, oldac_dist], kl)
    self.update_info = ((ob_no, oldac_na, adv_n), surr, surr_sampled) # Input and output variables needed for computing loss
    U.initialize() # Initialize uninitialized TF variables
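# A minimal NumPy sanity check (not part of the original code) for the diagonal-Gaussian
# log-probability computed inline above:
#   log N(x | mu, diag(std^2)) = -sum(log std) - 0.5*k*log(2*pi) - 0.5*sum((x - mu)^2 / std^2)
# It should agree with scipy.stats.multivariate_normal; all names here are illustrative.
import numpy as np
from scipy.stats import multivariate_normal

def diag_gaussian_logprob(x, mean, std):
    # Mirrors the logprob_n expression above, one value per batch row.
    return (- np.sum(np.log(std), axis=1)
            - 0.5 * np.log(2.0 * np.pi) * x.shape[1]
            - 0.5 * np.sum(np.square(x - mean) / np.square(std), axis=1))

mean = np.zeros((1, 3)); std = np.full((1, 3), 0.5); x = np.array([[0.1, -0.2, 0.3]])
ours = diag_gaussian_logprob(x, mean, std)[0]
ref = multivariate_normal(mean[0], np.diag(std[0] ** 2)).logpdf(x[0])
assert np.isclose(ours, ref)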
def __init__(self, ob_dim, ac_dim): #pylint: disable=W0613
    #X = tf.placeholder(tf.float32, shape=[None, ob_dim*2+ac_dim*2+2]) # batch of observations
    X = tf.placeholder(tf.float32, shape=[None, ob_dim*2+2]) # batch of observations
    vtarg_n = tf.placeholder(tf.float32, shape=[None], name='vtarg')
    wd_dict = {}
    h1 = tf.nn.elu(dense(X, 64, "h1", weight_init=U.normc_initializer(1.0), bias_init=0, weight_loss_dict=wd_dict))
    h2 = tf.nn.elu(dense(h1, 64, "h2", weight_init=U.normc_initializer(1.0), bias_init=0, weight_loss_dict=wd_dict))
    vpred_n = dense(h2, 1, "hfinal", weight_init=U.normc_initializer(1.0), bias_init=0, weight_loss_dict=wd_dict)[:,0]
    sample_vpred_n = vpred_n + tf.random_normal(tf.shape(vpred_n))
    wd_loss = tf.get_collection("vf_losses", None)
    loss = U.mean(tf.square(vpred_n - vtarg_n)) + tf.add_n(wd_loss)
    loss_sampled = U.mean(tf.square(vpred_n - tf.stop_gradient(sample_vpred_n)))
    self._predict = U.function([X], vpred_n)
    optim = kfac.KfacOptimizer(learning_rate=0.001, cold_lr=0.001*(1-0.9), momentum=0.9,
                               clip_kl=0.3, epsilon=0.1, stats_decay=0.95,
                               async=1, kfac_update=2, cold_iter=50,  # NOTE: `async` is a reserved word in Python 3.7+; newer K-FAC forks rename this argument
                               weight_decay_dict=wd_dict, max_grad_norm=None)
    vf_var_list = [var for var in tf.trainable_variables() if "vf" in var.name]
    update_op, self.q_runner = optim.minimize(loss, loss_sampled, var_list=vf_var_list)
    self.do_update = U.function([X, vtarg_n], update_op) #pylint: disable=E1101
    U.initialize() # Initialize uninitialized TF variables
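# Hypothetical usage sketch (not from the original source): the class name and data
# below are illustrative only. X must have ob_dim*2+2 columns to match the placeholder
# above; do_update runs one K-FAC step regressing predictions toward the targets.
import numpy as np

vf = NeuralNetValueFunction(ob_dim=11, ac_dim=3)          # assumed class name
X = np.random.randn(256, 11 * 2 + 2).astype(np.float32)   # featurized observations
vtarg = np.random.randn(256).astype(np.float32)           # e.g. discounted returns
vf.do_update(X, vtarg)       # one optimizer step on the regression loss
vpred = vf._predict(X)       # value predictions, shape (256,)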
def __init__(self, ob_dim, ac_dim):
    # Here we'll construct a bunch of expressions, which will be used in two places:
    # (1) When sampling actions
    # (2) When computing loss functions, for the policy update
    # Variables specific to (1) have the word "sampled" in them,
    # whereas variables specific to (2) have the word "old" in them
    ob_no = tf.placeholder(tf.float32, shape=[None, ob_dim*2], name="ob") # batch of observations
    oldac_na = tf.placeholder(tf.float32, shape=[None, ac_dim], name="ac") # batch of previous actions
    oldac_dist = tf.placeholder(tf.float32, shape=[None, ac_dim*2], name="oldac_dist") # batch of previous action distributions
    adv_n = tf.placeholder(tf.float32, shape=[None], name="adv") # advantage function estimate
    oldlogprob_n = tf.placeholder(tf.float32, shape=[None], name='oldlogprob') # log probability of previous actions
    wd_dict = {}
    h1 = tf.nn.tanh(dense(ob_no, 64, "h1", weight_init=U.normc_initializer(1.0), bias_init=0.0, weight_loss_dict=wd_dict))
    h2 = tf.nn.tanh(dense(h1, 64, "h2", weight_init=U.normc_initializer(1.0), bias_init=0.0, weight_loss_dict=wd_dict))
    mean_na = dense(h2, ac_dim, "mean", weight_init=U.normc_initializer(0.1), bias_init=0.0, weight_loss_dict=wd_dict) # Mean control output
    self.wd_dict = wd_dict
    self.logstd_1a = logstd_1a = tf.get_variable("logstd", [ac_dim], tf.float32, tf.zeros_initializer()) # Log standard deviation of the action distribution
    logstd_1a = tf.expand_dims(logstd_1a, 0)
    std_1a = tf.exp(logstd_1a)
    std_na = tf.tile(std_1a, [tf.shape(mean_na)[0], 1])
    ac_dist = tf.concat([tf.reshape(mean_na, [-1, ac_dim]), tf.reshape(std_na, [-1, ac_dim])], 1)
    sampled_ac_na = tf.random_normal(tf.shape(ac_dist[:,ac_dim:])) * ac_dist[:,ac_dim:] + ac_dist[:,:ac_dim] # This is the sampled action we'll perform.
    logprobsampled_n = - U.sum(tf.log(ac_dist[:,ac_dim:]), axis=1) - 0.5 * tf.log(2.0*np.pi)*ac_dim - 0.5 * U.sum(tf.square(ac_dist[:,:ac_dim] - sampled_ac_na) / (tf.square(ac_dist[:,ac_dim:])), axis=1) # Logprob of sampled action
    logprob_n = - U.sum(tf.log(ac_dist[:,ac_dim:]), axis=1) - 0.5 * tf.log(2.0*np.pi)*ac_dim - 0.5 * U.sum(tf.square(ac_dist[:,:ac_dim] - oldac_na) / (tf.square(ac_dist[:,ac_dim:])), axis=1) # Logprob of previous actions under CURRENT policy (whereas oldlogprob_n is under OLD policy)
    kl = U.mean(kl_div(oldac_dist, ac_dist, ac_dim))
    #kl = .5 * U.mean(tf.square(logprob_n - oldlogprob_n)) # Approximation of KL divergence between old policy used to generate actions, and new policy used to compute logprob_n
    surr = - U.mean(adv_n * logprob_n) # Loss function that we'll differentiate to get the policy gradient
    surr_sampled = - U.mean(logprob_n) # Sampled loss of the policy
    self._act = U.function([ob_no], [sampled_ac_na, ac_dist, logprobsampled_n]) # Generate a new action and its logprob
    #self.compute_kl = U.function([ob_no, oldac_na, oldlogprob_n], kl) # Compute (approximate) KL divergence between old policy and new policy
    self.compute_kl = U.function([ob_no, oldac_dist], kl)
    self.update_info = ((ob_no, oldac_na, adv_n), surr, surr_sampled) # Input and output variables needed for computing loss
    U.initialize() # Initialize uninitialized TF variables
def __init__(self, ob_dim, ac_dim, ac_space, bins):
    # Here we'll construct a bunch of expressions, which will be used in two places:
    # (1) When sampling actions
    # (2) When computing loss functions, for the policy update
    # Variables specific to (1) have the word "sampled" in them,
    # whereas variables specific to (2) have the word "old" in them
    ob_no = tf.placeholder(tf.float32, shape=[None, ob_dim * 2], name="ob") # batch of observations
    oldac_na = tf.placeholder(tf.int32, shape=[None, ac_dim], name="ac") # batch of previous actions
    oldac_logits = tf.placeholder(tf.float32, shape=[None, ac_dim * bins], name="oldac_logit") # batch of previous action logits
    adv_n = tf.placeholder(tf.float32, shape=[None], name="adv") # advantage function estimate
    self.pdtype = make_pdtype(ac_space)
    wd_dict = {}

    # forward pass
    h1 = tf.nn.tanh(dense(ob_no, 64, "h1", weight_init=U.normc_initializer(1.0), bias_init=0.0, weight_loss_dict=wd_dict))
    h2 = tf.nn.tanh(dense(h1, 64, "h2", weight_init=U.normc_initializer(1.0), bias_init=0.0, weight_loss_dict=wd_dict))
    logits_na = dense(h2, self.pdtype.param_shape()[0], "logits", weight_init=U.normc_initializer(0.1), bias_init=0.0, weight_loss_dict=wd_dict) # logits of the action distribution
    self.wd_dict = wd_dict
    self.pd = self.pdtype.pdfromflat(logits_na) # multi-categorical distribution

    # sample action for control
    sampled_ac_na = self.pd.sample()

    # log prob of sampled and previous actions
    logprobsampled_n = -self.pd.neglogp(sampled_ac_na)
    logprob_n = -self.pd.neglogp(oldac_na)

    # KL divergence between old and current policies
    old_pd = self.pdtype.pdfromflat(oldac_logits)
    kl = U.mean(old_pd.kl(self.pd))

    # surrogate losses
    surr = -U.mean(adv_n * logprob_n)
    surr_sampled = -U.mean(logprob_n)

    # expressions
    self._act = U.function([ob_no], [sampled_ac_na, logits_na, logprobsampled_n])
    self.compute_kl = U.function([ob_no, oldac_logits], kl)
    self.update_info = ((ob_no, oldac_na, adv_n), surr, surr_sampled)
    U.initialize()
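# Illustrative NumPy sketch (an assumption about the layout, not the original code):
# with a MultiDiscrete action space of ac_dim dimensions and `bins` choices each,
# pdfromflat treats the flat logits [batch, ac_dim * bins] as ac_dim independent
# categorical distributions over bins classes.
import numpy as np

def sample_multicategorical(logits_flat, ac_dim, bins, rng=np.random):
    logits = logits_flat.reshape(-1, ac_dim, bins)
    probs = np.exp(logits - logits.max(axis=-1, keepdims=True))
    probs /= probs.sum(axis=-1, keepdims=True)      # per-dimension softmax
    cdf = probs.cumsum(axis=-1)
    u = rng.uniform(size=logits.shape[:2] + (1,))
    return (u > cdf).sum(axis=-1)                   # [batch, ac_dim], ints in [0, bins)

acs = sample_multicategorical(np.zeros((4, 2 * 5)), ac_dim=2, bins=5)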
def _init(self, ob_space, ac_space, hid_size, num_hid_layers, gaussian_fixed_var=True):
    assert isinstance(ob_space, gym.spaces.Box)

    self.pdtype = pdtype = make_pdtype(ac_space)
    sequence_length = None

    ob = U.get_placeholder(name="g_ob", dtype=tf.float32,
                           shape=[sequence_length] + list(ob_space.shape))

    with tf.variable_scope("obfilter"):
        self.ob_rms = RunningMeanStd(shape=ob_space.shape)

    obz = tf.clip_by_value((ob - self.ob_rms.mean) / self.ob_rms.std, -5.0, 5.0)

    # last_out = obz
    # for i in range(num_hid_layers):
    #     last_out = tf.nn.tanh(dense(last_out, hid_size, "vffc%i" % (i + 1), weight_init=U.normc_initializer(1.0)))
    # self.vpred = dense(last_out, 1, "vffinal", weight_init=U.normc_initializer(1.0))[:, 0]

    last_out = obz
    for i in range(num_hid_layers):
        last_out = tf.nn.tanh(
            dense(last_out, hid_size, "polfc%i" % (i + 1),
                  weight_init=U.normc_initializer(1.0)))
    if gaussian_fixed_var and isinstance(ac_space, gym.spaces.Box):
        mean = dense(last_out, pdtype.param_shape()[0] // 2, "polfinal",
                     U.normc_initializer(0.01))
        logstd = tf.get_variable(name="pol_logstd",
                                 shape=[1, pdtype.param_shape()[0] // 2],
                                 initializer=tf.zeros_initializer())
        pdparam = tf.concat([mean, mean * 0.0 + logstd], axis=1)
    else:
        pdparam = dense(last_out, pdtype.param_shape()[0], "polfinal",
                        U.normc_initializer(0.01))

    self.pd = pdtype.pdfromflat(pdparam)

    self.state_in = []
    self.state_out = []

    # change for BC
    stochastic = U.get_placeholder(name="stochastic", dtype=tf.bool, shape=())
    ac = U.switch(stochastic, self.pd.sample(), self.pd.mode())
    self.ac = ac
    self._act = U.function([stochastic, ob], [ac])
def modeling(input_ob):
    # stack tanh hidden layers, each fed the previous layer's output
    last_out = input_ob
    for i in range(num_hid_layers):
        last_out = tf.nn.tanh(
            dense(last_out, hid_size, "fc%i" % (i + 1),
                  weight_init=U.normc_initializer(1.0)))
    if gaussian_fixed_var and isinstance(ac_space, gym.spaces.Box):
        mean = dense(last_out, pdtype.param_shape()[0] // 2, "final",
                     U.normc_initializer(0.01))
        logstd = tf.get_variable(name='med_logstd',
                                 shape=[1, pdtype.param_shape()[0] // 2],
                                 initializer=tf.zeros_initializer())
        pdparam = tf.concat([mean, mean * 0.0 + logstd], axis=1)
    else:
        pdparam = dense(last_out, pdtype.param_shape()[0], "final",
                        U.normc_initializer(0.01))
    return pdparam
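# Shape sketch (illustrative, not from the source): for a Box action space with k
# dimensions, pdparam concatenates a per-sample mean with a state-independent logstd.
# The trick `mean * 0.0 + logstd` just broadcasts the [1, k] logstd variable to the
# batch dimension.
import numpy as np

mean = np.random.randn(32, 4)        # [batch, k] network output
logstd = np.zeros((1, 4))            # [1, k] trainable variable
pdparam = np.concatenate([mean, mean * 0.0 + logstd], axis=1)
assert pdparam.shape == (32, 8)      # first k columns: mean, last k: logstd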
def _init(self, ob_space, ac_space, hid_size, num_hid_layers, gaussian_fixed_var=True):
    assert isinstance(ob_space, gym.spaces.Box)

    self.pdtype = pdtype = make_pdtype(ac_space)
    sequence_length = None

    ob = U.get_placeholder(name="ob", dtype=tf.float32,
                           shape=[sequence_length] + list(ob_space.shape))

    with tf.variable_scope("obfilter"):
        self.ob_rms = RunningMeanStd(shape=ob_space.shape)

    obz = tf.clip_by_value((ob - self.ob_rms.mean) / self.ob_rms.std, -5.0, 5.0)

    last_out = obz
    for i in range(num_hid_layers):
        last_out = tf.nn.tanh(
            dense(last_out, hid_size, "vffc%i" % (i + 1),
                  weight_init=U.normc_initializer(1.0)))
    self.vpred = dense(last_out, 1, "vffinal", weight_init=U.normc_initializer(1.0))[:, 0]

    last_out = obz
    for i in range(num_hid_layers):
        last_out = tf.nn.tanh(
            dense(last_out, hid_size, "polfc%i" % (i + 1),
                  weight_init=U.normc_initializer(1.0)))
    if gaussian_fixed_var and isinstance(ac_space, gym.spaces.Box):
        mean = dense(last_out, pdtype.param_shape()[0] // 2, "polfinal",
                     U.normc_initializer(0.01))
        logstd = tf.get_variable(name="logstd",
                                 shape=[1, pdtype.param_shape()[0] // 2],
                                 initializer=tf.zeros_initializer())
        pdparam = tf.concat([mean, mean * 0.0 + logstd], axis=1)
    else:
        pdparam = dense(last_out, pdtype.param_shape()[0], "polfinal",
                        U.normc_initializer(0.01))

    self.pd = pdtype.pdfromflat(pdparam)

    self.state_in = []
    self.state_out = []

    # change for BC
    stochastic = U.get_placeholder(name="stochastic", dtype=tf.bool, shape=())
    ac = U.switch(stochastic, self.pd.sample(), self.pd.mode())
    self.ac = ac
    self._act = U.function([stochastic, ob], [ac, self.vpred])
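# Hypothetical usage (the names `pi` and `env` are assumed, not from the source):
# _act maps a batched observation to an action and a value estimate; the `stochastic`
# flag switches between sampling from self.pd and taking its mode.
ob = env.reset()                          # assumes a gym-style env matching ob_space
acs, vpreds = pi._act(True, ob[None])     # stochastic rollout action; take acs[0] to step
acs_det, _ = pi._act(False, ob[None])     # deterministic (mode) action for evaluation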
def __init__(self, sess, ob_dim, ac_dim, vf_lr=0.001, cv_lr=0.001, reuse=False):
    # Here we'll construct a bunch of expressions, which will be used in two places:
    # (1) When sampling actions
    # (2) When computing loss functions, for the policy update
    # Variables specific to (1) have the word "sampled" in them,
    # whereas variables specific to (2) have the word "old" in them
    self.relaxed = False
    self.X = tf.placeholder(tf.float32, shape=[None, ob_dim*2+ac_dim*2+2]) # batch of value-function features (see preproc below)
    self.ob_no = tf.placeholder(tf.float32, shape=[None, ob_dim*2], name="ob") # batch of observations
    self.oldac_na = tf.placeholder(tf.float32, shape=[None, ac_dim], name="ac") # batch of previous actions
    oldac_dist = tf.placeholder(tf.float32, shape=[None, ac_dim*2], name="oldac_dist") # batch of previous action distributions

    with tf.variable_scope("model", reuse=reuse):
        h1 = tf.nn.tanh(dense(self.ob_no, 64, "pi_h1", weight_init=U.normc_initializer(1.0), bias_init=0.0))
        h2 = tf.nn.tanh(dense(h1, 64, "pi_h2", weight_init=U.normc_initializer(1.0), bias_init=0.0))
        mean_na = dense(h2, ac_dim, "pi", weight_init=U.normc_initializer(0.1), bias_init=0.0) # Mean control output
        self.logstd_1a = logstd_1a = tf.get_variable("logstd", [ac_dim], tf.float32, tf.zeros_initializer()) # Log standard deviation of the action distribution
        logstd_1a = tf.expand_dims(logstd_1a, 0)
        self.std_1a = tf.exp(logstd_1a)
        self.std_na = tf.tile(self.std_1a, [tf.shape(mean_na)[0], 1])
        ac_dist = tf.concat([tf.reshape(mean_na, [-1, ac_dim]), tf.reshape(self.std_na, [-1, ac_dim])], 1)
        sampled_ac_na = tf.random_normal(tf.shape(ac_dist[:,ac_dim:])) * ac_dist[:,ac_dim:] + ac_dist[:,:ac_dim] # This is the sampled action we'll perform.
        logprobsampled_n = - U.sum(tf.log(ac_dist[:,ac_dim:]), axis=1) - 0.5 * tf.log(2.0*np.pi)*ac_dim - 0.5 * U.sum(tf.square(ac_dist[:,:ac_dim] - sampled_ac_na) / (tf.square(ac_dist[:,ac_dim:])), axis=1) # Logprob of sampled action
        self.logprob_n = - U.sum(tf.log(ac_dist[:,ac_dim:]), axis=1) - 0.5 * tf.log(2.0*np.pi)*ac_dim - 0.5 * U.sum(tf.square(ac_dist[:,:ac_dim] - self.oldac_na) / (tf.square(ac_dist[:,ac_dim:])), axis=1) # Logprob of previous actions under CURRENT policy (whereas oldlogprob_n is under OLD policy)
        kl = U.mean(kl_div(oldac_dist, ac_dist, ac_dim))

        vh1 = tf.nn.elu(dense(self.X, 64, "vf_h1", weight_init=U.normc_initializer(1.0), bias_init=0))
        vh2 = tf.nn.elu(dense(vh1, 64, "vf_h2", weight_init=U.normc_initializer(1.0), bias_init=0))
        vpred_n = dense(vh2, 1, "vf", weight_init=None, bias_init=0)
        v0 = vpred_n[:, 0]

    self.vf_optim = tf.train.AdamOptimizer(vf_lr)

    def act(ob):
        ac, dist, logp = sess.run([sampled_ac_na, ac_dist, logprobsampled_n],
                                  {self.ob_no: ob[None]}) # Generate a new action and its logprob
        return ac[0], dist[0], logp[0]

    def value(obs, x):
        return sess.run(v0, {self.X: x, self.ob_no: obs})

    def preproc(path):
        l = pathlength(path)
        al = np.arange(l).reshape(-1, 1) / 10.0
        act = path["action_dist"].astype('float32')
        X = np.concatenate([path['observation'], act, al, np.ones((l, 1))], axis=1)
        return X

    def predict(obs, path):
        return value(obs, preproc(path))

    def compute_kl(ob, dist):
        return sess.run(kl, {self.ob_no: ob, oldac_dist: dist})

    self.mean = mean_na
    self.vf = v0
    self.act = act
    self.value = value
    self.preproc = preproc
    self.predict = predict
    self.compute_kl = compute_kl
    self.a0 = sampled_ac_na
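# Illustrative rollout sketch (the names `policy`, `env`, obs_array, and dist_array are
# assumed): act() samples one action from the current Gaussian, preproc() builds the
# value-function features [obs, action_dist, t/10, 1], and predict() evaluates the
# baseline on them. `path` is assumed to carry whatever arrays pathlength() expects.
ob = env.reset()
ac, dist, logp = policy.act(ob)          # single-step action, dist params, and logprob
path = {"observation": obs_array, "action_dist": dist_array}   # gathered over a rollout
vpred = policy.predict(obs_array, path)  # baseline values for advantage estimation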
def __init__(self, ob_dim, ac_dim): #pylint: disable=W0613
    X = tf.placeholder(tf.float32, shape=[None, ob_dim*2+ac_dim*2+2]) # batch of observations
    vtarg_n = tf.placeholder(tf.float32, shape=[None], name='vtarg')
    wd_dict = {}
    h1 = tf.nn.elu(dense(X, 64, "h1", weight_init=U.normc_initializer(1.0), bias_init=0, weight_loss_dict=wd_dict))
    h2 = tf.nn.elu(dense(h1, 64, "h2", weight_init=U.normc_initializer(1.0), bias_init=0, weight_loss_dict=wd_dict))
    vpred_n = dense(h2, 1, "hfinal", weight_init=U.normc_initializer(1.0), bias_init=0, weight_loss_dict=wd_dict)[:,0]
    sample_vpred_n = vpred_n + tf.random_normal(tf.shape(vpred_n))
    wd_loss = tf.get_collection("vf_losses", None)
    loss = U.mean(tf.square(vpred_n - vtarg_n)) + tf.add_n(wd_loss)
    loss_sampled = U.mean(tf.square(vpred_n - tf.stop_gradient(sample_vpred_n)))
    self._predict = U.function([X], vpred_n)
    optim = kfac.KfacOptimizer(learning_rate=0.001, cold_lr=0.001*(1-0.9), momentum=0.9,
                               clip_kl=0.3, epsilon=0.1, stats_decay=0.95,
                               async=1, kfac_update=2, cold_iter=50,  # NOTE: `async` is a reserved word in Python 3.7+; newer K-FAC forks rename this argument
                               weight_decay_dict=wd_dict, max_grad_norm=None)
    vf_var_list = [var for var in tf.trainable_variables() if "vf" in var.name]
    update_op, self.q_runner = optim.minimize(loss, loss_sampled, var_list=vf_var_list)
    self.do_update = U.function([X, vtarg_n], update_op) #pylint: disable=E1101
    U.initialize() # Initialize uninitialized TF variables
def _init(self, ob_space, ac_space, hid_size, num_hid_layers, gaussian_fixed_var=True, **kwargs):
    assert isinstance(ob_space, gym.spaces.Box)

    self.pdtype = pdtype = make_pdtype(ac_space)
    sequence_length = None

    ob = U.get_placeholder(name="ob", dtype=tf.float32,
                           shape=[sequence_length] + list(ob_space.shape))

    from hr_coordination.pbt.pbt_utils import conv_network_fn
    last_out = conv_network_fn(**kwargs)(ob)
    self.vpred = dense(last_out, 1, "vffinal", weight_init=U.normc_initializer(1.0))[:, 0]

    self.logits = tf.layers.dense(last_out, 6, activation=None)
    probs = tf.nn.softmax(self.logits, axis=1)
    action_mode = tf.argmax(probs, axis=1)
    action_dist = tf.distributions.Categorical(probs=probs)
    action_sampled = action_dist.sample()
    self.pd = action_dist

    self.state_in = []
    self.state_out = []

    # change for BC
    stochastic = U.get_placeholder(name="stochastic", dtype=tf.bool, shape=())
    # NOTE: the stochastic flag is kept for interface compatibility only; the action
    # below is always sampled (action_mode is computed but unused).
    ac = action_sampled
    self.ac = action_sampled
    self._act = U.function([stochastic, ob], [ac, self.vpred])
def _init(self, ob_space, l2_lambda, ac_space, hid_size, num_hid_layers):
    #assert isinstance(ob_space, gym.spaces.Box)
    self.l2_reg = l2_lambda
    self.pdtype = pdtype = make_pdtype(ac_space) # probability type
    sequence_length = None

    self.ob = U.get_placeholder(name="ob", dtype=tf.float32,
                                shape=[sequence_length] + list(ob_space.shape))

    with tf.variable_scope("obfilter"):
        self.ob_rms = RunningMeanStd(shape=ob_space.shape)

    #obz = tf.clip_by_value((ob - self.ob_rms.mean) / self.ob_rms.std, -5.0, 5.0)
    obz = self.ob # no normalization

    last_out = obz
    for i in range(num_hid_layers):
        last_out = tf.nn.tanh(
            dense(last_out, hid_size, "polfc%i" % (i + 1),
                  weight_init=U.normc_initializer(1.0)))
    if isinstance(ac_space, gym.spaces.Box): # double check if the diag is changed during training
        mean = dense(last_out, pdtype.param_shape()[0] // 2, "polfinal",
                     U.normc_initializer(0.01))
        logstd = tf.get_variable(name="logstd",
                                 shape=[1, pdtype.param_shape()[0] // 2],
                                 initializer=tf.zeros_initializer())
        pdparam = tf.concat([mean, mean * 0.0 + logstd], axis=1)
    else:
        pdparam = dense(last_out, pdtype.param_shape()[0], "polfinal",
                        U.normc_initializer(0.01))

    self.pd = pdtype.pdfromflat(pdparam) # a probability distribution, parameterized by pdparam

    self.state_in = []
    self.state_out = []

    stochastic = U.get_placeholder(name="stochastic", dtype=tf.bool, shape=()) # stochastically sample an action or deterministically pick the mode
    ac = U.switch(stochastic, self.pd.sample(), self.pd.mode())
    self.ac = ac
    self._act = U.function([stochastic, self.ob], ac) # return the action

    # compute logp:
    #if isinstance(ac_space, gym.spaces.Box):
    #    self.tmp_ac = tf.placeholder(tf.float32, shape=[sequence_length] + list(ac_space.shape))
    #else:
    self.tmp_ac = tf.placeholder(tf.uint8, shape=[sequence_length] + list(ac_space.shape))
    self._tf_logp = self.pd.logp(self.tmp_ac)
    self._logp = U.function([self.tmp_ac, self.ob], self._tf_logp)
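# Hypothetical usage (the names `pi`, demo_acs, and demo_obs are assumed): _logp
# evaluates log pi(a|s) for given action/observation pairs, e.g. as a behavior-cloning
# objective over demonstrations. Note the input order is (actions, observations).
import numpy as np

logps = pi._logp(demo_acs, demo_obs)   # shape [batch]
bc_loss = -np.mean(logps)              # maximize mean log-likelihood of the demos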
def __init__(self, ob_dim, ac_dim):
    """
    Create an MLP policy for a value function

    :param ob_dim: (int) Observation dimension
    :param ac_dim: (int) Action dimension
    """
    obs_ph = tf.placeholder(tf.float32, shape=[None, ob_dim * 2 + ac_dim * 2 + 2]) # batch of observations
    vtarg_n = tf.placeholder(tf.float32, shape=[None], name='vtarg')
    wd_dict = {}
    layer_1 = tf.nn.elu(dense(obs_ph, 64, "h1", weight_init=tf_util.normc_initializer(1.0),
                              bias_init=0, weight_loss_dict=wd_dict))
    layer_2 = tf.nn.elu(dense(layer_1, 64, "h2", weight_init=tf_util.normc_initializer(1.0),
                              bias_init=0, weight_loss_dict=wd_dict))
    vpred_n = dense(layer_2, 1, "hfinal", weight_init=tf_util.normc_initializer(1.0),
                    bias_init=0, weight_loss_dict=wd_dict)[:, 0]
    sample_vpred_n = vpred_n + tf.random_normal(tf.shape(vpred_n))
    wd_loss = tf.get_collection("vf_losses", None)
    loss = tf.reduce_mean(tf.square(vpred_n - vtarg_n)) + tf.add_n(wd_loss)
    loss_sampled = tf.reduce_mean(tf.square(vpred_n - tf.stop_gradient(sample_vpred_n)))
    self._predict = tf_util.function([obs_ph], vpred_n)
    optim = kfac.KfacOptimizer(learning_rate=0.001, cold_lr=0.001 * (1 - 0.9), momentum=0.9,
                               clip_kl=0.3, epsilon=0.1, stats_decay=0.95,
                               async=1, kfac_update=2, cold_iter=50,  # NOTE: `async` is a reserved word in Python 3.7+; newer K-FAC forks rename this argument
                               weight_decay_dict=wd_dict, max_grad_norm=None)
    vf_var_list = [var for var in tf.trainable_variables() if "vf" in var.name]
    update_op, self.q_runner = optim.minimize(loss, loss_sampled, var_list=vf_var_list)
    self.do_update = tf_util.function([obs_ph, vtarg_n], update_op) # pylint: disable=E1101
    tf_util.initialize() # Initialize uninitialized TF variables
def _init(self, ob_space, ac_space, hid_size, num_hid_layers, gaussian_fixed_var=True):
    assert isinstance(ob_space, gym.spaces.Dict)

    self.pdtype = pdtype = make_pdtype(ac_space)
    sequence_length = None

    ob_config = U.get_placeholder(name="ob", dtype=tf.float32,
                                  shape=[sequence_length] + list(ob_space.spaces['joint'].shape))
    ob_target = U.get_placeholder(name="goal", dtype=tf.float32,
                                  shape=[sequence_length] + list(ob_space.spaces['target'].shape))
    obs_pos = U.get_placeholder(name="obs_pos", dtype=tf.float32,
                                shape=[sequence_length] + list(ob_space.spaces['obstacle_pos1'].shape))
    #is_training = U.get_placeholder(name="bn_training", dtype=tf.bool, shape=())

    # construct value function model
    '''with tf.variable_scope("obfilter"):
        self.ob_rms = RunningMeanStd(shape=ob_space['joint'].shape)
    obz = tf.clip_by_value((ob_config - self.ob_rms.mean) / self.ob_rms.std, -5.0, 5.0)
    last_out = obz
    goal_last_out = tf.clip_by_value((ob_target - self.ob_rms.mean) / self.ob_rms.std, -5.0, 5.0)'''
    last_out = ob_config
    goal_last_out = ob_target
    obs_last_out = obs_pos
    for i in range(num_hid_layers):
        last_out = dense(last_out, hid_size, "vfcfc%i" % (i + 1),
                         weight_init=U.normc_initializer(1.0), weight_loss_dict={})
        #last_out = tf.layers.batch_normalization(last_out, training=is_training, name="vfcbn%i" % (i + 1))
        last_out = tf.nn.tanh(last_out)
        goal_last_out = dense(goal_last_out, hid_size, "vfgfc%i" % (i + 1),
                              weight_init=U.normc_initializer(1.0), weight_loss_dict={})
        #goal_last_out = tf.layers.batch_normalization(goal_last_out, training=is_training, name="vfgbn%i" % (i + 1))
        goal_last_out = tf.nn.tanh(goal_last_out)
        obs_last_out = dense(obs_last_out, hid_size, "vfobsfc%i" % (i + 1),
                             weight_init=U.normc_initializer(1.0), weight_loss_dict={})
        #obs_last_out = tf.layers.batch_normalization(obs_last_out, training=is_training, name="vfobn%i" % (i + 1))
        obs_last_out = tf.nn.tanh(obs_last_out)
    vpred = tf.concat([last_out, goal_last_out, obs_last_out], -1)
    self.vpred = dense(vpred, 1, "vffinal", weight_init=U.normc_initializer(1.0))[:, 0]

    # construct policy probability distribution model
    last_out = ob_config
    goal_last_out = ob_target
    obs_last_out = obs_pos
    for i in range(num_hid_layers):
        last_out = dense(last_out, hid_size, "pol_cfc%i" % (i + 1),
                         weight_init=U.normc_initializer(1.0), weight_loss_dict={})
        #last_out = tf.layers.batch_normalization(last_out, training=is_training, name="pol_cbn%i" % (i + 1))
        last_out = tf.nn.tanh(last_out)
        goal_last_out = dense(goal_last_out, hid_size, "pol_gfc%i" % (i + 1),
                              weight_init=U.normc_initializer(1.0), weight_loss_dict={})
        #goal_last_out = tf.layers.batch_normalization(goal_last_out, training=is_training, name="pol_gbn%i" % (i + 1))
        goal_last_out = tf.nn.tanh(goal_last_out)
        obs_last_out = dense(obs_last_out, hid_size, "pol_obsfc%i" % (i + 1),
                             weight_init=U.normc_initializer(1.0), weight_loss_dict={})
        #obs_last_out = tf.layers.batch_normalization(obs_last_out, training=is_training, name="pol_obn%i" % (i + 1))
        obs_last_out = tf.nn.tanh(obs_last_out)
    last_out = tf.concat([last_out, goal_last_out, obs_last_out], -1)
    if gaussian_fixed_var and isinstance(ac_space, gym.spaces.Box):
        mean = dense(last_out, pdtype.param_shape()[0] // 2, "polfinal",
                     U.normc_initializer(0.01))
        logstd = tf.get_variable(name="logstd",
                                 shape=[1, pdtype.param_shape()[0] // 2],
                                 initializer=tf.constant_initializer([0.2, 0.2, -1., -1.]))
        pdparam = tf.concat([mean, mean * 0.0 + logstd], axis=1)
    else:
        pdparam = dense(last_out, pdtype.param_shape()[0], "polfinal",
                        U.normc_initializer(0.01))

    self.pd = pdtype.pdfromflat(pdparam)
    self.state_in = []
    self.state_out = []

    # change for BC
    stochastic = U.get_placeholder(name="stochastic", dtype=tf.bool, shape=())
    ac = U.switch(stochastic, self.pd.sample(), self.pd.mode())
    self.ac = ac
    self._act = U.function([stochastic, ob_config, ob_target, obs_pos], [ac, self.vpred])
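# Hypothetical usage (assumed names): this policy conditions on three observation
# pieces, passed in the same order as the placeholders above, each batched.
obs = {"joint": joint_state, "target": goal_pos, "obstacle_pos1": obstacle_pos}
acs, vpreds = pi._act(True, obs["joint"][None], obs["target"][None],
                      obs["obstacle_pos1"][None])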
def _init(self, ob_space, ac_space, hid_size, num_hid_layers, gaussian_fixed_var=False, popart=True):
    assert isinstance(ob_space, gym.spaces.Box)

    self.pdtype = pdtype = make_pdtype(ac_space)

    ob = U.get_placeholder(name="ob", dtype=tf.float32, shape=[None] + list(ob_space.shape))

    with tf.variable_scope("obfilter"):
        self.ob_rms = RunningMeanStd(shape=ob_space.shape)
    with tf.variable_scope("popart"):
        self.v_rms = RunningMeanStd(shape=[1])

    obz = tf.clip_by_value((ob - self.ob_rms.mean) / self.ob_rms.std, -5.0, 5.0)

    last_out = obz
    for i in range(num_hid_layers):
        last_out = tf.nn.tanh(
            dense(last_out, hid_size, "vffc%i" % (i + 1),
                  weight_init=U.normc_initializer(1.0)))
    self.norm_vpred = dense(last_out, 1, "vffinal", weight_init=U.normc_initializer(1.0))[:, 0]
    if popart:
        self.vpred = denormalize(self.norm_vpred, self.v_rms)
    else:
        self.vpred = self.norm_vpred

    last_out = obz
    for i in range(num_hid_layers):
        last_out = tf.nn.tanh(
            dense(last_out, hid_size, "polfc%i" % (i + 1),
                  weight_init=U.normc_initializer(1.0)))
    if gaussian_fixed_var and isinstance(ac_space, gym.spaces.Box):
        mean = dense(last_out, pdtype.param_shape()[0] // 2, "polfinal",
                     U.normc_initializer(0.01))
        logstd = tf.get_variable(name="logstd",
                                 shape=[1, pdtype.param_shape()[0] // 2],
                                 initializer=tf.zeros_initializer())
        pdparam = tf.concat([mean, mean * 0.0 + logstd], axis=1)
    else:
        pdparam = dense(last_out, pdtype.param_shape()[0], "polfinal",
                        U.normc_initializer(0.01))

    self.pd = pdtype.pdfromflat(pdparam)

    self.state_in = []
    self.state_out = []

    # change for BC
    stochastic = U.get_placeholder(name="stochastic", dtype=tf.bool, shape=())
    ac = U.switch(stochastic, self.pd.sample(), self.pd.mode())
    self.mean_and_logstd = U.function([ob], [self.pd.mean, self.pd.logstd])

    self.ac = ac
    self._act = U.function([stochastic, ob], [ac, self.vpred])

    self.use_popart = popart
    if popart:
        self.init_popart()

    ret = tf.placeholder(tf.float32, [None])
    vferr = tf.reduce_mean(tf.square(self.vpred - ret))
    self.vlossandgrad = U.function([ob, ret], U.flatgrad(vferr, self.get_vf_variable()))
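# Minimal NumPy sketch of the (de)normalization behind PopArt, under the assumption
# that denormalize() multiplies by the running std and adds the running mean, with
# normalize() as its inverse; v_rms tracks statistics of the value targets.
import numpy as np

def denormalize_np(norm_v, mean, std):
    return norm_v * std + mean          # the network predicts in normalized space

def normalize_np(v, mean, std):
    return (v - mean) / std             # targets are normalized before regression

v = np.array([10.0, 20.0]); mean, std = 15.0, 5.0
assert np.allclose(denormalize_np(normalize_np(v, mean, std), mean, std), v)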
def _init(self, ob_space, ac_space, hid_size, num_hid_layers, gaussian_fixed_var=True):
    obs, pdtype = self.get_obs_and_pdtype(ob_space, ac_space)

    with tf.variable_scope("obfilter"):
        self.ob_rms = RunningMeanStd(shape=ob_space.shape)

    obz = tf.clip_by_value((obs - self.ob_rms.mean) / self.ob_rms.std, -5.0, 5.0)

    last_out = obz
    for i in range(num_hid_layers):
        last_out = tf.nn.tanh(
            dense(last_out, hid_size, "vffc%i" % (i + 1),
                  weight_init=tf_util.normc_initializer(1.0)))
    self.vpred = dense(last_out, 1, "vffinal", weight_init=tf_util.normc_initializer(1.0))[:, 0]

    last_out = obz
    for i in range(num_hid_layers):
        last_out = tf.nn.tanh(
            dense(last_out, hid_size, "polfc%i" % (i + 1),
                  weight_init=tf_util.normc_initializer(1.0)))
    if gaussian_fixed_var and isinstance(ac_space, gym.spaces.Box):
        mean = dense(last_out, pdtype.param_shape()[0] // 2, "polfinal",
                     tf_util.normc_initializer(0.01))
        logstd = tf.get_variable(name="logstd",
                                 shape=[1, pdtype.param_shape()[0] // 2],
                                 initializer=tf.zeros_initializer())
        pdparam = tf.concat([mean, mean * 0.0 + logstd], axis=1)
    else:
        pdparam = dense(last_out, pdtype.param_shape()[0], "polfinal",
                        tf_util.normc_initializer(0.01))

    self.proba_distribution = pdtype.proba_distribution_from_flat(pdparam)

    self.state_in = []
    self.state_out = []

    # change for BC
    self.stochastic_ph = tf.placeholder(dtype=tf.bool, shape=(), name="stochastic")
    action = tf_util.switch(self.stochastic_ph, self.proba_distribution.sample(),
                            self.proba_distribution.mode())
    self.action = action
    self._act = tf_util.function([self.stochastic_ph, obs], [action, self.vpred])