def test_MpiAdam():
    np.random.seed(0)
    tf.set_random_seed(0)

    a = tf.Variable(np.random.randn(3).astype('float32'))
    b = tf.Variable(np.random.randn(2, 5).astype('float32'))
    loss = tf.reduce_sum(tf.square(a)) + tf.reduce_sum(tf.sin(b))

    stepsize = 1e-2
    update_op = tf.train.AdamOptimizer(stepsize).minimize(loss)
    do_update = U.function([], loss, updates=[update_op])

    tf.get_default_session().run(tf.global_variables_initializer())
    for i in range(10):
        print(i, do_update())

    # Reset and repeat the same optimization with MpiAdam; the printed losses
    # should match the tf.train.AdamOptimizer run above.
    tf.set_random_seed(0)
    tf.get_default_session().run(tf.global_variables_initializer())

    var_list = [a, b]
    # Note: no updates=[update_op] here -- MpiAdam applies the update itself,
    # and stepping both optimizers would break the comparison.
    lossandgrad = U.function([], [loss, U.flatgrad(loss, var_list)])
    adam = MpiAdam(var_list)
    for i in range(10):
        l, g = lossandgrad()
        adam.update(g, stepsize)
        print(i, l)
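# Usage sketch (an assumption about how this test is invoked, in the style of
# baselines' mpi_adam.py): test_MpiAdam needs a default TF session to exist.
if __name__ == '__main__':
    with tf.Session().as_default():
        test_MpiAdam()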
def validate_probtype(probtype, pdparam):
    N = 100000
    # Check to see if mean negative log likelihood == differential entropy
    Mval = np.repeat(pdparam[None, :], N, axis=0)
    M = probtype.param_placeholder([N])
    X = probtype.sample_placeholder([N])
    pd = probtype.pdclass()(M)
    calcloglik = U.function([X, M], pd.logp(X))
    calcent = U.function([M], pd.entropy())
    Xval = U.eval(pd.sample(), feed_dict={M: Mval})
    logliks = calcloglik(Xval, Mval)
    entval_ll = -logliks.mean()  # pylint: disable=E1101
    entval_ll_stderr = logliks.std() / np.sqrt(N)  # pylint: disable=E1101
    entval = calcent(Mval).mean()  # pylint: disable=E1101
    assert np.abs(entval - entval_ll) < 3 * entval_ll_stderr  # within 3 sigmas

    # Check to see if kldiv[p, q] = -ent[p] - E_p[log q]
    M2 = probtype.param_placeholder([N])
    pd2 = probtype.pdclass()(M2)
    q = pdparam + np.random.randn(pdparam.size) * 0.1
    Mval2 = np.repeat(q[None, :], N, axis=0)
    calckl = U.function([M, M2], pd.kl(pd2))
    klval = calckl(Mval, Mval2).mean()  # pylint: disable=E1101
    logliks = calcloglik(Xval, Mval2)
    klval_ll = -entval - logliks.mean()  # pylint: disable=E1101
    klval_ll_stderr = logliks.std() / np.sqrt(N)  # pylint: disable=E1101
    assert np.abs(klval - klval_ll) < 3 * klval_ll_stderr  # within 3 sigmas
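# Hedged example of exercising validate_probtype, modeled on baselines' own
# test_probtypes; DiagGaussianPdType and CategoricalPdType are assumed to be
# importable from the same distributions module.
def test_probtypes():
    np.random.seed(0)
    with tf.Session().as_default():
        # Diagonal Gaussian: flat params are [mean, logstd], so dim = size // 2.
        pdparam_diag_gauss = np.array([-.2, .3, .4, -.5, .1, -.5, .1, 0.8])
        validate_probtype(DiagGaussianPdType(pdparam_diag_gauss.size // 2),
                          pdparam_diag_gauss)
        # Categorical: flat params are the logits, one per category.
        pdparam_categorical = np.array([-.2, .3, .5])
        validate_probtype(CategoricalPdType(pdparam_categorical.size),
                          pdparam_categorical)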
def __init__(self, name, ob, ac_space, hid_size, num_hid_layers,
             num_subpolicies, gaussian_fixed_var=True):
    self.hid_size = hid_size
    self.num_hid_layers = num_hid_layers
    self.num_subpolicies = num_subpolicies
    self.gaussian_fixed_var = gaussian_fixed_var
    with tf.variable_scope(name):
        self.scope = tf.get_variable_scope().name
        with tf.variable_scope("obfilter"):
            self.ob_rms = RunningMeanStd(shape=(ob.get_shape()[1],))
        obz = tf.clip_by_value((ob - self.ob_rms.mean) / self.ob_rms.std,
                               -5.0, 5.0)
        # obz = ob

        # value function
        last_out = obz
        for i in range(num_hid_layers):
            last_out = tf.nn.tanh(
                U.dense(last_out, hid_size, "vffc%i" % (i + 1),
                        weight_init=U.normc_initializer(1.0)))
        self.vpred = U.dense(last_out, 1, "vffinal",
                             weight_init=U.normc_initializer(1.0))[:, 0]

        # master policy
        last_out = obz
        for i in range(num_hid_layers):
            last_out = tf.nn.tanh(
                U.dense(last_out, hid_size, "masterpol%i" % (i + 1),
                        weight_init=U.normc_initializer(1.0)))
        self.selector = U.dense(last_out, num_subpolicies, "masterpol_final",
                                U.normc_initializer(0.01))
        self.pdtype = pdtype = CategoricalPdType(num_subpolicies)
        self.pd = pdtype.pdfromflat(self.selector)

        # sample actions
        stochastic = tf.placeholder(dtype=tf.bool, shape=())
        ac = U.switch(stochastic, self.pd.sample(), self.pd.mode())
        self._act = U.function([stochastic, ob], [ac, self.vpred])

        # debug
        self._debug = U.function([stochastic, ob], [ac, self.selector])
        self._act_forced = U.function([stochastic, ob, self.selector],
                                      [ac, self.vpred])
def __init__(self, env, master_policy, old_master_policy, sub_policies,
             old_sub_policies, comm, clip_param=0.2, entcoeff=0,
             optim_epochs=10, optim_stepsize=3e-4, optim_batchsize=64):
    self.clip_param = clip_param
    self.entcoeff = entcoeff
    self.optim_epochs = optim_epochs
    self.optim_stepsize = optim_stepsize
    self.optim_batchsize = optim_batchsize
    self.num_subpolicies = len(sub_policies)
    self.sub_policies = sub_policies
    self.master_policy = master_policy
    ob_space = env.observation_space
    ac_space = env.action_space
    self.sp_ac = sub_policies[0].pdtype.sample_placeholder([None])

    atarg = tf.placeholder(dtype=tf.float32, shape=[None])  # Target advantage
    ret = tf.placeholder(dtype=tf.float32, shape=[None])  # Empirical return

    # inputs for training theta
    ob = U.get_placeholder_cached(name="ob")
    ob_master = U.get_placeholder_cached(name="adv_ob")
    ac_master = master_policy.pdtype.sample_placeholder([None])

    loss_master = self.policy_loss_master(master_policy, old_master_policy,
                                          ob_master, ac_master, atarg, ret,
                                          clip_param)
    self.master_policy_var_list = master_policy.get_trainable_variables()
    self.master_loss = U.function(
        [ob_master, ac_master, atarg, ret],
        U.flatgrad(loss_master, self.master_policy_var_list))
    self.master_adam = MpiAdam(self.master_policy_var_list, comm=comm)

    self.assign_old_eq_new = U.function(
        [], [],
        updates=[tf.assign(oldv, newv) for (oldv, newv) in
                 zipsame(old_master_policy.get_variables(),
                         master_policy.get_variables())])

    self.assign_subs = []
    self.change_subs = []
    self.adams = []
    self.losses = []
    for i in range(self.num_subpolicies):
        varlist = sub_policies[i].get_trainable_variables()
        self.adams.append(MpiAdam(varlist))
        # loss for test; note that sub_policies[(i - 1) % 2] (the "other"
        # subpolicy passed alongside) hardcodes an assumption of exactly
        # two subpolicies
        loss = self.policy_loss(sub_policies[i], sub_policies[(i - 1) % 2],
                                old_sub_policies[i], ob, self.sp_ac, atarg,
                                ret, clip_param)
        self.losses.append(U.function([ob, self.sp_ac, atarg, ret],
                                      U.flatgrad(loss, varlist)))
        self.assign_subs.append(U.function(
            [], [],
            updates=[tf.assign(oldv, newv) for (oldv, newv) in
                     zipsame(old_sub_policies[i].get_variables(),
                             sub_policies[i].get_variables())]))
    self.zerograd = U.function([], self.nograd(varlist))

    U.initialize()
    self.master_adam.sync()
    for i in range(self.num_subpolicies):
        self.adams[i].sync()
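# Hedged sketch of the nograd helper used by self.zerograd above (a flat zero
# vector shaped like U.flatgrad's output, so workers that collected no data
# for a subpolicy can still join the MPI allreduce). Assumes U.numel as in
# baselines' tf_util; the actual repo helper may differ.
def nograd(self, var_list):
    return tf.concat(axis=0, values=[
        tf.reshape(tf.zeros_like(v), [U.numel(v)]) for v in var_list
    ])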
def __init__(self, name, ob, ac_space, network='mlp', gaussian_fixed_var=True,
             nsteps=None, nbatch=None, nlstm=256, states=None, masks=None,
             reuse=False):
    self.network = network
    shape = []
    for d in range(1, len(ob.shape)):
        shape.append(ob.shape[d])
    with tf.variable_scope(name, reuse=reuse):
        self.scope = tf.get_variable_scope().name
        with tf.variable_scope("obfilter"):
            self.ob_rms = RunningMeanStd(shape=shape)
        obs = tf.clip_by_value((ob - self.ob_rms.mean) / self.ob_rms.std,
                               -5.0, 5.0)

        if network == 'mlp':
            hid_size = 64
            num_hid_layers = 2
            self.hid_size = hid_size
            self.num_hid_layers = num_hid_layers
            self.gaussian_fixed_var = gaussian_fixed_var
            self._mlp(obs, hid_size, num_hid_layers, ac_space,
                      gaussian_fixed_var)
        elif network == 'cnn':
            self._cnn(obs, ac_space, gaussian_fixed_var)
        elif network == 'lstm':
            assert nsteps is not None and nbatch is not None
            assert states is not None and masks is not None
            assert isinstance(nsteps, int) and isinstance(nbatch, int)
            assert nsteps > 0 and nbatch > 0
            self._lstm(obs, states, masks, nlstm, ac_space, nbatch, nsteps)

        # sample actions
        stochastic = tf.placeholder(dtype=tf.bool, shape=())
        ac = U.switch(stochastic, self.pd.sample(), self.pd.mode())
        if network == 'mlp' or network == 'cnn':
            self._act = U.function([stochastic, ob], [ac, self.vpred])
        elif network == 'lstm':
            self._act = U.function([stochastic, ob, states, masks],
                                   [ac, self.vpred, self.snew])
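# Hedged usage sketch for the 'lstm' branch (pi, obs_seq, dones_seq, and nenv
# are illustrative assumptions, not names from this repo): the caller owns the
# recurrent state and threads it through successive calls.
def run_lstm_policy(pi, obs_seq, dones_seq, nenv, nlstm=256, stochastic=True):
    # Zero initial state, shaped like the `states` placeholder above.
    state = np.zeros((nenv, 2 * nlstm), dtype=np.float32)
    acs = []
    for obs, dones in zip(obs_seq, dones_seq):
        ac, vpred, state = pi._act(stochastic, obs, state, dones)
        acs.append(ac)
    return acs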
def __init__(self, epsilon=1e-2, shape=()):
    self._sum = tf.get_variable(
        dtype=tf.float64, shape=shape,
        initializer=tf.constant_initializer(0.0),
        name="runningsum", trainable=False)
    self._sumsq = tf.get_variable(
        dtype=tf.float64, shape=shape,
        initializer=tf.constant_initializer(epsilon),
        name="runningsumsq", trainable=False)
    self._count = tf.get_variable(
        dtype=tf.float64, shape=(),
        initializer=tf.constant_initializer(epsilon),
        name="count", trainable=False)
    self.shape = shape

    self.mean = tf.to_float(self._sum / self._count)
    self.std = tf.sqrt(tf.maximum(
        tf.to_float(self._sumsq / self._count) - tf.square(self.mean), 1e-2))

    newsum = tf.placeholder(shape=self.shape, dtype=tf.float64, name='sum')
    newsumsq = tf.placeholder(shape=self.shape, dtype=tf.float64, name='var')
    newcount = tf.placeholder(shape=[], dtype=tf.float64, name='count')
    self.incfiltparams = U.function(
        [newsum, newsumsq, newcount], [],
        updates=[tf.assign_add(self._sum, newsum),
                 tf.assign_add(self._sumsq, newsumsq),
                 tf.assign_add(self._count, newcount)])
    self.debug = U.function([], [self.mean, self.std])
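# Hedged sketch of the companion update() method, mirroring baselines'
# MPI-backed RunningMeanStd: sum / sum-of-squares / count are allreduced
# across workers and folded in through incfiltparams. Assumes mpi4py.
def update(self, x):
    from mpi4py import MPI
    x = x.astype('float64')
    n = int(np.prod(self.shape))
    totalvec = np.zeros(n * 2 + 1, 'float64')
    addvec = np.concatenate([x.sum(axis=0).ravel(),
                             np.square(x).sum(axis=0).ravel(),
                             np.array([len(x)], dtype='float64')])
    MPI.COMM_WORLD.Allreduce(addvec, totalvec, op=MPI.SUM)
    self.incfiltparams(totalvec[0:n].reshape(self.shape),
                       totalvec[n:2 * n].reshape(self.shape),
                       totalvec[2 * n])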
def __init__(self, name, ob, ac_space, num_subpolicies, network='mlp',
             gaussian_fixed_var=True):
    self.num_subpolicies = num_subpolicies
    self.gaussian_fixed_var = gaussian_fixed_var
    shape = []
    for d in range(1, len(ob.shape)):
        shape.append(ob.shape[d])
    with tf.variable_scope("obfilter", reuse=tf.AUTO_REUSE):
        self.ob_rms = RunningMeanStd(shape=shape)
    obs = tf.clip_by_value((ob - self.ob_rms.mean) / self.ob_rms.std,
                           -5.0, 5.0)
    with tf.variable_scope(name):
        self.scope = tf.get_variable_scope().name
        if network == 'mlp':
            hid_size = 64
            num_hid_layers = 2
            self.hid_size = hid_size
            self.num_hid_layers = num_hid_layers
            self._mlp(obs, num_subpolicies, hid_size, num_hid_layers,
                      ac_space, gaussian_fixed_var)
        elif network == 'cnn':
            self._cnn(obs, num_subpolicies)

        # sample actions
        stochastic = tf.placeholder(dtype=tf.bool, shape=())
        ac = U.switch(stochastic, self.pd.sample(), self.pd.mode())
        self._act = U.function([stochastic, ob], [ac, self.vpred])

        # debug
        self._debug = U.function([stochastic, ob], [ac, self.selector])
        self._act_forced = U.function([stochastic, ob, self.selector],
                                      [ac, self.vpred])
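# Hedged sketch (a hypothetical helper, not part of the original class):
# forcing the master to pick a given subpolicy by overriding the selector
# logits fed into _act_forced with a strongly one-hot vector.
def act_forced(self, stochastic, ob, subpolicy_index):
    forced = np.full((1, self.num_subpolicies), -1e9, dtype=np.float32)
    forced[0, subpolicy_index] = 1e9  # dominant logit -> near-certain choice
    ac, vpred = self._act_forced(stochastic, ob[None], forced)
    return ac[0], vpred[0]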
def __init__(self, env, policy, old_policy, sub_policies, old_sub_policies,
             comm, clip_param=0.2, entcoeff=0, optim_epochs=10,
             optim_stepsize=3e-4, optim_batchsize=64):
    self.policy = policy
    self.clip_param = clip_param
    self.entcoeff = entcoeff
    self.optim_epochs = optim_epochs
    self.optim_stepsize = optim_stepsize
    self.optim_batchsize = optim_batchsize
    self.num_subpolicies = len(sub_policies)
    self.sub_policies = sub_policies
    ob_space = env.observation_space
    ac_space = env.action_space
    if WRITE_SCALAR:
        self.scalar_writer = tf.summary.FileWriter(
            osp.join("savedir/", 'checkpoints', 'scalar%d' % time.time()))

    # inputs for training theta
    ob = U.get_placeholder_cached(name="ob")
    ac = policy.pdtype.sample_placeholder([None])
    atarg = tf.placeholder(dtype=tf.float32,
                           shape=[None])  # Target advantage function (if applicable)
    ret = tf.placeholder(dtype=tf.float32, shape=[None])  # Empirical return

    total_loss = self.policy_loss(policy, old_policy, ob, ac, atarg, ret,
                                  clip_param)
    self.master_policy_var_list = policy.get_trainable_variables()
    self.master_loss = U.function(
        [ob, ac, atarg, ret],
        U.flatgrad(total_loss, self.master_policy_var_list))
    self.master_adam = MpiAdam(self.master_policy_var_list, comm=comm)

    summ = tf.summary.scalar("total_loss", total_loss)
    self.calc_summary = U.function([ob, ac, atarg, ret], [summ])

    self.assign_old_eq_new = U.function(
        [], [],
        updates=[tf.assign(oldv, newv) for (oldv, newv) in
                 zipsame(old_policy.get_variables(), policy.get_variables())])

    self.assign_subs = []
    self.change_subs = []
    self.adams = []
    self.losses = []
    self.sp_ac = sub_policies[0].pdtype.sample_placeholder([None])
    for i in range(self.num_subpolicies):
        varlist = sub_policies[i].get_trainable_variables()
        self.adams.append(MpiAdam(varlist))
        # loss for test
        loss = self.policy_loss(sub_policies[i], old_sub_policies[i], ob,
                                self.sp_ac, atarg, ret, clip_param)
        self.losses.append(U.function([ob, self.sp_ac, atarg, ret],
                                      U.flatgrad(loss, varlist)))
        self.assign_subs.append(U.function(
            [], [],
            updates=[tf.assign(oldv, newv) for (oldv, newv) in
                     zipsame(old_sub_policies[i].get_variables(),
                             sub_policies[i].get_variables())]))
    self.zerograd = U.function([], self.nograd(varlist))

    U.initialize()
    self.master_adam.sync()
    for i in range(self.num_subpolicies):
        self.adams[i].sync()
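# Hedged sketch of what policy_loss is assumed to compute here: the standard
# PPO clipped surrogate plus value and entropy terms. The exact terms and
# coefficients in this repo may differ; this is a reference sketch only.
def policy_loss(self, pi, oldpi, ob, ac, atarg, ret, clip_param):
    # Probability ratio between current and old policies: p_new(ac) / p_old(ac).
    ratio = tf.exp(pi.pd.logp(ac) - oldpi.pd.logp(ac))
    surr1 = ratio * atarg
    surr2 = tf.clip_by_value(ratio, 1.0 - clip_param,
                             1.0 + clip_param) * atarg
    pol_surr = -tf.reduce_mean(tf.minimum(surr1, surr2))  # clipped surrogate
    vf_loss = tf.reduce_mean(tf.square(pi.vpred - ret))   # value regression
    ent = tf.reduce_mean(pi.pd.entropy())                 # exploration bonus
    return pol_surr + vf_loss - self.entcoeff * ent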
def __init__(self, name, ob, hid_size, num_hid_layers,
             gaussian_fixed_var=True):
    self.hid_size = hid_size
    self.num_hid_layers = num_hid_layers
    self.gaussian_fixed_var = gaussian_fixed_var
    with tf.variable_scope(name):
        self.scope = tf.get_variable_scope().name
        with tf.variable_scope("obfilter"):
            if len(ob.shape) == 2:
                self.ob_rms = RunningMeanStd(shape=(ob.get_shape()[1],))
            elif len(ob.shape) == 3:
                self.ob_rms = RunningMeanStd(
                    shape=(ob.get_shape()[1] * ob.get_shape()[2]))
            elif len(ob.shape) == 4:
                self.ob_rms = RunningMeanStd(
                    shape=(ob.get_shape()[1] * ob.get_shape()[2] *
                           ob.get_shape()[3]))
        # obz = tf.clip_by_value((ob - self.ob_rms.mean) / self.ob_rms.std, -5.0, 5.0)
        obz = ob

        # value function
        last_out = obz
        for i in range(num_hid_layers):
            last_out = tf.nn.tanh(
                U.dense(last_out, hid_size, "vffc%i" % (i + 1),
                        weight_init=U.normc_initializer(1.0)))
        self.vpred = tf.clip_by_value(
            U.sum(U.dense(last_out, 1, "vffinal",
                          weight_init=U.normc_initializer(1.0))[:, 0]),
            0.0, 1000.0)

        # evaluate the value prediction (this network has no action head)
        self._act = U.function([ob], [self.vpred])
def __init__(self, name, ob, ac_space, hid_size, num_hid_layers,
             gaussian_fixed_var=True):
    self.hid_size = hid_size
    self.num_hid_layers = num_hid_layers
    self.gaussian_fixed_var = gaussian_fixed_var
    with tf.variable_scope(name):
        self.scope = tf.get_variable_scope().name
        with tf.variable_scope("obfilter"):
            self.ob_rms = RunningMeanStd(shape=(ob.get_shape()[1],))
        obz = tf.clip_by_value((ob - self.ob_rms.mean) / self.ob_rms.std,
                               -5.0, 5.0)
        # obz = ob

        # value function
        last_out = obz
        for i in range(num_hid_layers):
            last_out = tf.nn.tanh(
                U.dense(last_out, hid_size, "vffc%i" % (i + 1),
                        weight_init=U.normc_initializer(1.0)))
        self.vpred = U.dense(last_out, 1, "vffinal",
                             weight_init=U.normc_initializer(1.0))[:, 0]

        # sub policy
        self.pdtype = pdtype = make_pdtype(ac_space)
        last_out = obz
        for i in range(num_hid_layers):
            last_out = tf.nn.tanh(
                U.dense(last_out, hid_size, "pol%i" % (i + 1),
                        weight_init=U.normc_initializer(1.0)))
        if gaussian_fixed_var and isinstance(ac_space, gym.spaces.Box):
            mean = U.dense(last_out, pdtype.param_shape()[0] // 2, "polfinal",
                           U.normc_initializer(0.01))
            logstd = tf.get_variable(
                name="logstd", shape=[1, pdtype.param_shape()[0] // 2],
                initializer=tf.zeros_initializer())
            self.pdparam = U.concatenate([mean, mean * 0.0 + logstd], axis=1)
        else:
            self.pdparam = U.dense(last_out, pdtype.param_shape()[0],
                                   "polfinal", U.normc_initializer(0.01))
        self.pd = pdtype.pdfromflat(self.pdparam)

        # sample actions
        stochastic = tf.placeholder(dtype=tf.bool, shape=())
        ac = U.switch(stochastic, self.pd.sample(), self.pd.mode())
        self._act = U.function([stochastic, ob], [ac, self.vpred])
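# Hedged sketch of the usual companion act() helper (as in baselines'
# MlpPolicy); assumed rather than copied from this repo.
def act(self, stochastic, ob):
    # Add a batch dimension, run the policy, then strip the batch dimension.
    ac1, vpred1 = self._act(stochastic, ob[None])
    return ac1[0], vpred1[0]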
def __init__(self, envs, policies, sub_policies, old_policies,
             old_sub_policies, clip_param=0.2, vfcoeff=1., entcoeff=0,
             divcoeff=0., optim_epochs=10, master_lr=1e-3, sub_lr=3e-4,
             optim_batchsize=64, envsperbatch=None, num_rollouts=None,
             nlstm=256, recurrent=False):
    self.policies = policies
    self.sub_policies = sub_policies
    self.old_policies = old_policies
    self.old_sub_policies = old_sub_policies
    self.clip_param = clip_param
    self.entcoeff = entcoeff
    self.optim_epochs = optim_epochs
    self.optim_batchsize = optim_batchsize
    self.num_master_groups = num_master_groups = len(policies)
    self.num_subpolicies = num_subpolicies = len(sub_policies)
    self.ob_space = envs[0].observation_space
    self.ac_space = envs[0].action_space
    self.nbatch = nbatch = num_rollouts * envsperbatch
    self.envsperbatch = envsperbatch

    # master placeholders, one group per master policy
    self.master_obs = [
        U.get_placeholder(name="master_ob_%i" % x, dtype=tf.float32,
                          shape=[None] + list(self.ob_space.shape))
        for x in range(num_master_groups)]
    self.master_acs = [policies[0].pdtype.sample_placeholder([None])
                       for _ in range(num_master_groups)]
    self.master_atargs = [tf.placeholder(dtype=tf.float32, shape=[None])
                          for _ in range(num_master_groups)]
    self.master_ret = [tf.placeholder(dtype=tf.float32, shape=[None])
                       for _ in range(num_master_groups)]

    retvals = zip(*[self.policy_loss(
        policies[i], old_policies[i], self.master_obs[i], self.master_acs[i],
        self.master_atargs[i], self.master_ret[i], clip_param,
        mask=tf.constant(1.), vfcoeff=vfcoeff, entcoeff=entcoeff)
        for i in range(num_master_groups)])
    self.master_losses, self.master_kl, self.master_pol_surr, \
        self.master_vf_loss, self.master_entropy, self.master_values, \
        _ = retvals

    master_trainers = [tf.train.AdamOptimizer(learning_rate=master_lr,
                                              name='master_adam_%i' % _)
                       for _ in range(num_master_groups)]
    master_params = [policies[i].get_trainable_variables()
                     for i in range(num_master_groups)]
    master_grads = [tf.gradients(self.master_losses[i], master_params[i])
                    for i in range(num_master_groups)]
    master_grads = [list(zip(g, p)) for g, p in
                    zip(master_grads, master_params)]
    # TODO: gradient clipping
    self.assign_old_eq_new = [
        U.function([], [], updates=[
            tf.assign(oldv, newv) for (oldv, newv) in
            zipsame(old_policies[i].get_variables(),
                    policies[i].get_variables())])
        for i in range(num_master_groups)]
    self.master_train_steps = [
        master_trainers[i].apply_gradients(master_grads[i])
        for i in range(num_master_groups)]

    # subpolicy placeholders; only the observation placeholders depend on
    # whether the subpolicies are recurrent
    if not recurrent:
        self.sub_obs = [
            U.get_placeholder(name="sub_ob_%i" % x, dtype=tf.float32,
                              shape=[None] + list(self.ob_space.shape))
            for x in range(num_subpolicies)]
    else:
        self.sub_obs = [
            U.get_placeholder(name="sub_ob_%i" % x, dtype=tf.float32,
                              shape=[nbatch] + list(self.ob_space.shape))
            for x in range(num_subpolicies)]
        self.sub_masks = [
            U.get_placeholder(name="masks_%i" % _, dtype=tf.float32,
                              shape=[nbatch])
            for _ in range(num_subpolicies)]
        self.sub_states = [
            U.get_placeholder(name="states_%i" % _, dtype=tf.float32,
                              shape=[envsperbatch, 2 * nlstm])
            for _ in range(num_subpolicies)]
    # these placeholders are needed in both branches (the original defined
    # them only in the non-recurrent branch, which breaks recurrent mode)
    self.sub_acs = [sub_policies[0].pdtype.sample_placeholder([None])
                    for _ in range(num_subpolicies)]
    self.sub_atargs = [tf.placeholder(dtype=tf.float32, shape=[None])
                       for _ in range(num_subpolicies)]
    self.sub_ret = [tf.placeholder(dtype=tf.float32, shape=[None])
                    for _ in range(num_subpolicies)]
    self.logpacs = [tf.placeholder(dtype=tf.float32,
                                   shape=[num_subpolicies, None])
                    for _ in range(num_subpolicies)]
    self.loss_masks = [tf.placeholder(dtype=tf.float32, shape=[None])
                       for _ in range(num_subpolicies)]

    sub_retvals = zip(*[self.policy_loss(
        sub_policies[i], old_sub_policies[i], self.sub_obs[i],
        self.sub_acs[i], self.sub_atargs[i], self.sub_ret[i], clip_param,
        mask=self.loss_masks[i], vfcoeff=vfcoeff, entcoeff=entcoeff,
        divcoeff=divcoeff, logpacs=None)  # logpacs=self.logpacs[i]
        for i in range(num_subpolicies)])
    self.sub_losses, self.sub_kl, self.sub_pol_surr, self.sub_vf_loss, \
        self.sub_entropy, self.sub_values, self.div_loss = sub_retvals

    sub_trainers = [tf.train.AdamOptimizer(learning_rate=sub_lr)
                    for _ in range(num_subpolicies)]
    sub_params = [sub_policies[i].get_trainable_variables()
                  for i in range(num_subpolicies)]
    sub_grads = [tf.gradients(self.sub_losses[i], sub_params[i])
                 for i in range(num_subpolicies)]
    sub_grads = [list(zip(g, p)) for g, p in zip(sub_grads, sub_params)]
    # TODO: gradient clipping
    self.subs_assign_old_eq_new = [
        U.function([], [], updates=[
            tf.assign(oldv, newv) for (oldv, newv) in
            zipsame(old_sub_policies[i].get_variables(),
                    sub_policies[i].get_variables())])
        for i in range(num_subpolicies)]
    self.sub_train_steps = [sub_trainers[i].apply_gradients(sub_grads[i])
                            for i in range(num_subpolicies)]

    U.initialize()
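# Hedged usage sketch: one optimization step for master group i. The batch
# array names (obs_b, acs_b, advs_b, rets_b) and this helper itself are
# illustrative assumptions, not part of the original class.
def master_train_step(self, i, obs_b, acs_b, advs_b, rets_b):
    sess = tf.get_default_session()
    feed = {self.master_obs[i]: obs_b,
            self.master_acs[i]: acs_b,
            self.master_atargs[i]: advs_b,
            self.master_ret[i]: rets_b}
    # Run the Adam update and return the surrogate loss for logging.
    loss, _ = sess.run([self.master_losses[i], self.master_train_steps[i]],
                       feed_dict=feed)
    return loss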
def __init__(self, env, sub_policy, old_sub_policy, comm, clip_param=0.2,
             entcoeff=0, optim_epochs=10, optim_stepsize=3e-4,
             optim_batchsize=64, args=None):
    # self.policy = policy
    self.clip_param = clip_param
    self.entcoeff = entcoeff
    self.optim_epochs = optim_epochs
    self.optim_stepsize = optim_stepsize
    self.optim_batchsize = optim_batchsize
    # self.num_subpolicies = len(sub_policies)
    self.sub_policy = sub_policy
    self.args = args
    ob_space = env.observation_space
    ac_space = env.action_space

    # inputs for training theta
    ob = U.get_placeholder_cached(name="ob")
    # ac = policy.pdtype.sample_placeholder([None])
    atarg = tf.placeholder(dtype=tf.float32,
                           shape=[None])  # Target advantage function (if applicable)
    ret = tf.placeholder(dtype=tf.float32, shape=[None])  # Empirical return
    # note: this placeholder shadows the entcoeff constructor argument; the
    # entropy coefficient is fed in at call time instead
    entcoeff = tf.placeholder(dtype=tf.float32, name="entcoef")

    # master-policy training is disabled in this single-subpolicy variant:
    # total_loss = self.policy_loss(policy, old_policy, ob, ac, atarg, ret, clip_param, entcoeff)
    # self.master_policy_var_list = policy.get_trainable_variables()
    # self.master_loss = U.function([ob, ac, atarg, ret, entcoeff],
    #     U.flatgrad(total_loss, self.master_policy_var_list))
    # self.master_adam = MpiAdam(self.master_policy_var_list, comm=comm)
    # self.assign_old_eq_new = U.function([], [], updates=[tf.assign(oldv, newv)
    #     for (oldv, newv) in zipsame(old_policy.get_variables(), policy.get_variables())])

    self.assign_subs = []
    self.change_subs = []
    self.adams = []
    self.losses = []
    self.sp_ac = sub_policy.pdtype.sample_placeholder([None])

    # for i in range(self.num_subpolicies):
    varlist = sub_policy.get_trainable_variables()
    self.adams.append(MpiAdam(varlist))
    # loss for test
    loss = self.policy_loss(sub_policy, old_sub_policy, ob, self.sp_ac,
                            atarg, ret, clip_param, entcoeff)
    self.losses.append(U.function([ob, self.sp_ac, atarg, ret, entcoeff],
                                  U.flatgrad(loss, varlist)))
    self.assign_subs.append(U.function(
        [], [],
        updates=[tf.assign(oldv, newv) for (oldv, newv) in
                 zipsame(old_sub_policy.get_variables(),
                         sub_policy.get_variables())]))
    self.zerograd = U.function([], self.nograd(varlist))

    U.initialize()
    # self.master_adam.sync()
    self.adams[0].sync()
def __init__(self, name, ob, ac_space, hid_size, num_hid_layers,
             num_subpolicies, gaussian_fixed_var=True):
    self.hid_size = hid_size
    self.num_hid_layers = num_hid_layers
    self.num_subpolicies = num_subpolicies
    self.gaussian_fixed_var = gaussian_fixed_var
    with tf.variable_scope(name):
        self.scope = tf.get_variable_scope().name
        with tf.variable_scope("obfilter"):
            self.ob_rms = RunningMeanStd(shape=(ob.get_shape()[1],))
        # obz = tf.clip_by_value((ob - self.ob_rms.mean) / self.ob_rms.std, -5.0, 5.0)
        obz = ob / 255.0  # pixel observations are rescaled instead of whitened

        # shared conv trunk (replaces the tanh MLP used by the state-based
        # version; see the commented-out loops below)
        last_out = obz
        # for i in range(num_hid_layers):
        #     last_out = tf.nn.tanh(U.dense(last_out, hid_size, "vffc%i" % (i + 1),
        #                                   weight_init=U.normc_initializer(1.0)))
        last_out = tf.nn.relu(
            U.conv2d(last_out, 32, "l1", [8, 8], [4, 4], pad="VALID"))
        last_out = tf.nn.relu(
            U.conv2d(last_out, 64, "l2", [4, 4], [2, 2], pad="VALID"))
        last_out = tf.nn.relu(
            U.conv2d(last_out, 32, "l3", [3, 3], [1, 1], pad="VALID"))
        last_out = U.flattenallbut0(last_out)
        last_out = tf.nn.relu(
            tf.layers.dense(last_out, 512, name='lin',
                            kernel_initializer=U.normc_initializer(1.0)))

        # value function
        self.vpred = U.dense(last_out, 1, "vffinal",
                             weight_init=U.normc_initializer(1.0))[:, 0]

        # master policy (shares the conv features)
        # last_out = obz
        # for i in range(num_hid_layers):
        #     last_out = tf.nn.tanh(U.dense(last_out, hid_size, "masterpol%i" % (i + 1),
        #                                   weight_init=U.normc_initializer(1.0)))
        self.selector = U.dense(last_out, num_subpolicies, "masterpol_final",
                                U.normc_initializer(0.01))
        self.pdtype = pdtype = CategoricalPdType(num_subpolicies)
        self.pd = pdtype.pdfromflat(self.selector)

        # sample actions
        stochastic = tf.placeholder(dtype=tf.bool, shape=())
        ac = U.switch(stochastic, self.pd.sample(), self.pd.mode())
        self._act = U.function([stochastic, ob], [ac, self.vpred])

        # debug
        self._debug = U.function([stochastic, ob], [ac, self.selector])
        self._act_forced = U.function([stochastic, ob, self.selector],
                                      [ac, self.vpred])