def __init__(self, name, ob, ac_space, hid_size, num_hid_layers, num_subpolicies, gaussian_fixed_var=True):
    """Build the master-policy graph: an MLP value head plus a categorical
    selector over ``num_subpolicies`` sub-policies.

    Args:
        name: variable scope under which all weights are created.
        ob: observation placeholder/tensor; indexed as (batch, ob_dim) here.
        ac_space: environment action space (unused by this class; the
            master policy always emits a categorical selector choice).
        hid_size: width of each hidden layer.
        num_hid_layers: number of tanh hidden layers per head.
        num_subpolicies: number of sub-policies the selector chooses among.
        gaussian_fixed_var: stored for interface parity with the sub-policy
            classes; not used by the categorical selector itself.
    """
    self.hid_size = hid_size
    self.num_hid_layers = num_hid_layers
    # FIX: `self.num_subpolicies` was assigned twice in the original; once is enough.
    self.num_subpolicies = num_subpolicies
    self.gaussian_fixed_var = gaussian_fixed_var
    with tf.variable_scope(name):
        self.scope = tf.get_variable_scope().name

        with tf.variable_scope("obfilter"):
            # Running mean/std of observations, used to normalize inputs.
            self.ob_rms = RunningMeanStd(shape=(ob.get_shape()[1], ))
        obz = tf.clip_by_value((ob - self.ob_rms.mean) / self.ob_rms.std, -5.0, 5.0)

        # Value function head.
        last_out = obz
        for i in range(num_hid_layers):
            last_out = tf.nn.tanh(
                U.dense(last_out, hid_size, "vffc%i" % (i + 1),
                        weight_init=U.normc_initializer(1.0)))
        self.vpred = U.dense(last_out, 1, "vffinal",
                             weight_init=U.normc_initializer(1.0))[:, 0]

        # Master policy head: logits over the sub-policies.
        last_out = obz
        for i in range(num_hid_layers):
            last_out = tf.nn.tanh(
                U.dense(last_out, hid_size, "masterpol%i" % (i + 1),
                        weight_init=U.normc_initializer(1.0)))
        self.selector = U.dense(last_out, num_subpolicies, "masterpol_final",
                                U.normc_initializer(0.01))
        self.pdtype = pdtype = CategoricalPdType(num_subpolicies)
        self.pd = pdtype.pdfromflat(self.selector)

        # Sample actions: stochastic sample or deterministic mode.
        stochastic = tf.placeholder(dtype=tf.bool, shape=())
        ac = U.switch(stochastic, self.pd.sample(), self.pd.mode())
        self._act = U.function([stochastic, ob], [ac, self.vpred])
        # Debug helpers: expose raw selector logits, and allow forcing a selector.
        self._debug = U.function([stochastic, ob], [ac, self.selector])
        self._act_forced = U.function([stochastic, ob, self.selector], [ac, self.vpred])
def __init__(self, name, ob, ac_space, network='mlp', gaussian_fixed_var=True, nsteps=None, nbatch=None, nlstm=256, states=None, masks=None, reuse=False):
    """Construct a policy whose trunk is chosen by ``network``.

    Supported trunks: ``'mlp'`` (64x2 tanh), ``'cnn'``, and ``'lstm'``
    (which additionally requires ``nsteps``/``nbatch``/``states``/``masks``).
    The trunk builders (``_mlp``/``_cnn``/``_lstm``) are expected to set
    ``self.pd``, ``self.vpred`` and, for LSTM, ``self.snew``.
    """
    self.network = network

    # Per-sample observation shape: every axis after the batch axis.
    ob_shape = [ob.shape[d] for d in range(1, len(ob.shape))]

    with tf.variable_scope(name, reuse=reuse):
        self.scope = tf.get_variable_scope().name

        with tf.variable_scope("obfilter"):
            self.ob_rms = RunningMeanStd(shape=ob_shape)
        obs = tf.clip_by_value((ob - self.ob_rms.mean) / self.ob_rms.std, -5.0, 5.0)

        if network == 'mlp':
            hid_size, num_hid_layers = 64, 2
            self.hid_size = hid_size
            self.num_hid_layers = num_hid_layers
            self.gaussian_fixed_var = gaussian_fixed_var
            self._mlp(obs, hid_size, num_hid_layers, ac_space, gaussian_fixed_var)
        elif network == 'cnn':
            self._cnn(obs, ac_space, gaussian_fixed_var)
        elif network == 'lstm':
            # The recurrent trunk needs batch geometry and state plumbing.
            assert nsteps is not None and nbatch is not None
            assert states is not None and masks is not None
            assert isinstance(nsteps, int) and isinstance(nbatch, int)
            assert nsteps > 0 and nbatch > 0
            self._lstm(obs, states, masks, nlstm, ac_space, nbatch, nsteps)

        # Action sampling: stochastic sample vs. deterministic mode.
        stochastic = tf.placeholder(dtype=tf.bool, shape=())
        ac = U.switch(stochastic, self.pd.sample(), self.pd.mode())
        if network in ('mlp', 'cnn'):
            self._act = U.function([stochastic, ob], [ac, self.vpred])
        elif network == 'lstm':
            # Recurrent policies also take/return the hidden state.
            self._act = U.function([stochastic, ob, states, masks], [ac, self.vpred, self.snew])
def __init__(self, name, ob, ac_space, num_subpolicies, network='mlp', gaussian_fixed_var=True):
    """Master policy selecting among ``num_subpolicies`` sub-policies.

    The trunk builders (``_mlp``/``_cnn``) are expected to set ``self.pd``,
    ``self.vpred`` and ``self.selector``.
    """
    self.num_subpolicies = num_subpolicies
    self.gaussian_fixed_var = gaussian_fixed_var

    # Per-sample observation shape: every axis after the batch axis.
    ob_shape = [ob.shape[d] for d in range(1, len(ob.shape))]

    # Observation statistics live OUTSIDE this policy's own scope and use
    # AUTO_REUSE, so they can be shared across policy instances.
    with tf.variable_scope("obfilter", reuse=tf.AUTO_REUSE):
        self.ob_rms = RunningMeanStd(shape=ob_shape)
    obs = tf.clip_by_value((ob - self.ob_rms.mean) / self.ob_rms.std, -5.0, 5.0)

    with tf.variable_scope(name):
        self.scope = tf.get_variable_scope().name

        if network == 'mlp':
            hid_size, num_hid_layers = 64, 2
            self.hid_size = hid_size
            self.num_hid_layers = num_hid_layers
            self._mlp(obs, num_subpolicies, hid_size, num_hid_layers, ac_space, gaussian_fixed_var)
        elif network == 'cnn':
            self._cnn(obs, num_subpolicies)

        # Action sampling: stochastic sample vs. deterministic mode.
        stochastic = tf.placeholder(dtype=tf.bool, shape=())
        ac = U.switch(stochastic, self.pd.sample(), self.pd.mode())
        self._act = U.function([stochastic, ob], [ac, self.vpred])
        # Debug helpers: expose selector logits, and allow forcing a selector.
        self._debug = U.function([stochastic, ob], [ac, self.selector])
        self._act_forced = U.function([stochastic, ob, self.selector], [ac, self.vpred])
def __init__(self, name, ob, ac_space, hid_size, num_hid_layers, gaussian_fixed_var=True):
    """Sub-policy network: an MLP value head plus an action head.

    For Box action spaces with ``gaussian_fixed_var`` the action
    distribution is a diagonal Gaussian whose log-std is a learned,
    state-independent variable; otherwise all distribution parameters
    come from the final dense layer.
    """
    self.hid_size = hid_size
    self.num_hid_layers = num_hid_layers
    self.gaussian_fixed_var = gaussian_fixed_var

    with tf.variable_scope(name):
        self.scope = tf.get_variable_scope().name

        with tf.variable_scope("obfilter"):
            self.ob_rms = RunningMeanStd(shape=(ob.get_shape()[1],))
        norm_ob = tf.clip_by_value((ob - self.ob_rms.mean) / self.ob_rms.std, -5.0, 5.0)

        # ---- value head ----
        h = norm_ob
        for layer in range(num_hid_layers):
            h = tf.nn.tanh(U.dense(h, hid_size, "vffc%i" % (layer + 1),
                                   weight_init=U.normc_initializer(1.0)))
        self.vpred = U.dense(h, 1, "vffinal",
                             weight_init=U.normc_initializer(1.0))[:, 0]

        # ---- action head ----
        self.pdtype = pdtype = make_pdtype(ac_space)
        h = norm_ob
        for layer in range(num_hid_layers):
            h = tf.nn.tanh(U.dense(h, hid_size, "pol%i" % (layer + 1),
                                   weight_init=U.normc_initializer(1.0)))
        if gaussian_fixed_var and isinstance(ac_space, gym.spaces.Box):
            # Mean from the net; log-std is a separate learned variable
            # broadcast across the batch (mean * 0.0 carries the batch shape).
            mean = U.dense(h, pdtype.param_shape()[0] // 2, "polfinal",
                           U.normc_initializer(0.01))
            logstd = tf.get_variable(name="logstd",
                                     shape=[1, pdtype.param_shape()[0] // 2],
                                     initializer=tf.zeros_initializer())
            self.pdparam = U.concatenate([mean, mean * 0.0 + logstd], axis=1)
        else:
            self.pdparam = U.dense(h, pdtype.param_shape()[0], "polfinal",
                                   U.normc_initializer(0.01))
        self.pd = pdtype.pdfromflat(self.pdparam)

        # Action sampling: stochastic sample vs. deterministic mode.
        stochastic = tf.placeholder(dtype=tf.bool, shape=())
        ac = U.switch(stochastic, self.pd.sample(), self.pd.mode())
        self._act = U.function([stochastic, ob], [ac, self.vpred])
def __init__(self, name, ob, ac_space, hid_size, num_hid_layers, num_subpolicies, gaussian_fixed_var=True):
    """Build the CNN master-policy graph: a convolutional trunk feeding a
    value head and a categorical selector over ``num_subpolicies``
    sub-policies.

    Args:
        name: variable scope under which all weights are created.
        ob: observation placeholder/tensor; treated as an image here
            (scaled by 1/255) — presumably (batch, H, W, C). TODO confirm.
        ac_space: environment action space (unused by this class).
        hid_size, num_hid_layers: kept for interface parity with the MLP
            master policy; not used by the convolutional trunk.
        num_subpolicies: number of sub-policies the selector chooses among.
        gaussian_fixed_var: stored for interface parity; not used by the
            categorical selector.
    """
    self.hid_size = hid_size
    self.num_hid_layers = num_hid_layers
    # FIX: `self.num_subpolicies` was assigned twice in the original; once is enough.
    self.num_subpolicies = num_subpolicies
    self.gaussian_fixed_var = gaussian_fixed_var
    with tf.variable_scope(name):
        self.scope = tf.get_variable_scope().name

        with tf.variable_scope("obfilter"):
            # NOTE(review): ob_rms is created (so checkpoints/updaters that
            # reference self.ob_rms keep working) but mean/std normalization
            # is deliberately not applied here — image inputs are scaled by
            # 1/255 instead. Confirm the (ob_dim,) shape is intended.
            self.ob_rms = RunningMeanStd(shape=(ob.get_shape()[1], ))
        obz = ob / 255.0

        # Convolutional trunk (Nature-DQN-like layout) shared by both heads.
        last_out = obz
        last_out = tf.nn.relu(
            U.conv2d(last_out, 32, "l1", [8, 8], [4, 4], pad="VALID"))
        last_out = tf.nn.relu(
            U.conv2d(last_out, 64, "l2", [4, 4], [2, 2], pad="VALID"))
        last_out = tf.nn.relu(
            U.conv2d(last_out, 32, "l3", [3, 3], [1, 1], pad="VALID"))
        last_out = U.flattenallbut0(last_out)
        last_out = tf.nn.relu(
            tf.layers.dense(last_out, 512, name='lin',
                            kernel_initializer=U.normc_initializer(1.0)))

        # Value function head.
        self.vpred = U.dense(last_out, 1, "vffinal",
                             weight_init=U.normc_initializer(1.0))[:, 0]

        # Master policy head: selector logits from the same trunk features.
        self.selector = U.dense(last_out, num_subpolicies, "masterpol_final",
                                U.normc_initializer(0.01))
        self.pdtype = pdtype = CategoricalPdType(num_subpolicies)
        self.pd = pdtype.pdfromflat(self.selector)

        # Sample actions: stochastic sample or deterministic mode.
        stochastic = tf.placeholder(dtype=tf.bool, shape=())
        ac = U.switch(stochastic, self.pd.sample(), self.pd.mode())
        self._act = U.function([stochastic, ob], [ac, self.vpred])
        # Debug helpers: expose raw selector logits, and allow forcing a selector.
        self._debug = U.function([stochastic, ob], [ac, self.selector])
        self._act_forced = U.function([stochastic, ob, self.selector], [ac, self.vpred])