def _mlp(self, obs, num_subpolicies, hid_size, num_hid_layers, ac_space, gaussian_fixed_var):
    """Build the MLP value head and the master-policy selector on ``obs``.

    Two independent stacks of ``num_hid_layers`` tanh dense layers (each
    ``hid_size`` wide) are created from ``obs``; one ends in a single-unit
    value output, the other in a ``num_subpolicies``-unit selector output.

    Sets ``self.vpred``, ``self.selector``, ``self.pdtype`` and ``self.pd``.
    ``ac_space`` and ``gaussian_fixed_var`` are accepted but unused here.
    """
    layer_numbers = range(1, num_hid_layers + 1)

    # Value-function tower.
    value_hidden = obs
    for layer in layer_numbers:
        value_pre = U.dense(value_hidden, hid_size, "vffc%i" % layer,
                            weight_init=U.normc_initializer(1.0))
        value_hidden = tf.nn.tanh(value_pre)
    value_out = U.dense(value_hidden, 1, "vffinal", weight_init=U.normc_initializer(1.0))
    # Keep only column 0 of the single-unit output.
    self.vpred = value_out[:, 0]

    # Master-policy tower; it starts again from the raw observations.
    master_hidden = obs
    for layer in layer_numbers:
        master_pre = U.dense(master_hidden, hid_size, "masterpol%i" % layer,
                             weight_init=U.normc_initializer(1.0))
        master_hidden = tf.nn.tanh(master_pre)
    self.selector = U.dense(master_hidden, num_subpolicies, "masterpol_final",
                            U.normc_initializer(0.01))

    # Distribution over sub-policies, parameterised by the selector output.
    selector_pdtype = CategoricalPdType(num_subpolicies)
    self.pdtype = selector_pdtype
    self.pd = selector_pdtype.pdfromflat(self.selector)
def __init__(self, name, ob, ac_space, hid_size, num_hid_layers, num_subpolicies,
             gaussian_fixed_var=True):
    """Build the MLP master policy: a value head plus a sub-policy selector.

    Args:
        name: Variable scope under which all variables are created; the
            resolved scope name is stored in ``self.scope``.
        ob: Observation tensor. Its second dimension sizes the running
            mean/std filter, and it is an input of the compiled functions.
        ac_space: Accepted for interface compatibility; unused here.
        hid_size: Width of every hidden dense layer.
        num_hid_layers: Number of hidden layers in each of the two towers.
        num_subpolicies: Number of outputs of the selector head.
        gaussian_fixed_var: Stored on the instance; not used in this method.

    Sets ``self.ob_rms``, ``self.vpred``, ``self.selector``, ``self.pdtype``,
    ``self.pd`` and the compiled callables ``self._act``, ``self._debug`` and
    ``self._act_forced``.
    """
    self.hid_size = hid_size
    self.num_hid_layers = num_hid_layers
    self.num_subpolicies = num_subpolicies
    self.gaussian_fixed_var = gaussian_fixed_var

    with tf.variable_scope(name):
        self.scope = tf.get_variable_scope().name

        with tf.variable_scope("obfilter"):
            self.ob_rms = RunningMeanStd(shape=(ob.get_shape()[1], ))
        # Standardise observations with the running statistics, then clip to [-5, 5].
        obz = tf.clip_by_value((ob - self.ob_rms.mean) / self.ob_rms.std, -5.0, 5.0)

        # value function
        last_out = obz
        for i in range(num_hid_layers):
            last_out = tf.nn.tanh(
                U.dense(last_out, hid_size, "vffc%i" % (i + 1),
                        weight_init=U.normc_initializer(1.0)))
        self.vpred = U.dense(last_out, 1, "vffinal",
                             weight_init=U.normc_initializer(1.0))[:, 0]

        # master policy (separate tower, restarted from the filtered observations)
        last_out = obz
        for i in range(num_hid_layers):
            last_out = tf.nn.tanh(
                U.dense(last_out, hid_size, "masterpol%i" % (i + 1),
                        weight_init=U.normc_initializer(1.0)))
        self.selector = U.dense(last_out, num_subpolicies, "masterpol_final",
                                U.normc_initializer(0.01))
        self.pdtype = pdtype = CategoricalPdType(num_subpolicies)
        self.pd = pdtype.pdfromflat(self.selector)

    # sample actions: `stochastic` picks between a sample and the mode of the distribution
    stochastic = tf.placeholder(dtype=tf.bool, shape=())
    ac = U.switch(stochastic, self.pd.sample(), self.pd.mode())
    self._act = U.function([stochastic, ob], [ac, self.vpred])

    # debug: same inputs, but also returns the raw selector output
    self._debug = U.function([stochastic, ob], [ac, self.selector])
    # `self.selector` is listed as an input here, presumably so callers can
    # feed the selector values directly instead of computing them from `ob`.
    self._act_forced = U.function([stochastic, ob, self.selector], [ac, self.vpred])
def _cnn(self, obs, num_subpolicies):
    """Build the value head and master-policy selector on shared features.

    ``feature_net(obs)`` produces the features; a single-unit dense layer on
    them gives ``self.vpred`` and a ``num_subpolicies``-unit dense layer gives
    ``self.selector``. Also sets ``self.pdtype`` and ``self.pd``.
    """
    shared_features = feature_net(obs)

    # Value head: keep only column 0 of the single-unit output.
    value_out = U.dense(shared_features, 1, "vffinal", weight_init=U.normc_initializer(1.0))
    self.vpred = value_out[:, 0]

    # Selector head on the same features.
    self.selector = U.dense(shared_features, num_subpolicies, "masterpol_final",
                            U.normc_initializer(0.01))

    selector_pdtype = CategoricalPdType(num_subpolicies)
    self.pdtype = selector_pdtype
    self.pd = selector_pdtype.pdfromflat(self.selector)
def __init__(self, name, ob, hid_size, num_hid_layers, gaussian_fixed_var=True):
    """Build a value-only MLP network on the unnormalised observations.

    Args:
        name: Variable scope under which all variables are created; the
            resolved scope name is stored in ``self.scope``.
        ob: Observation tensor of rank 2, 3 or 4. The product of its
            non-leading dimensions sizes the running mean/std filter.
        hid_size: Width of every hidden dense layer.
        num_hid_layers: Number of hidden tanh layers.
        gaussian_fixed_var: Stored on the instance; not used in this method.

    Sets ``self.vpred`` and the compiled callable ``self._act``. ``self.ob_rms``
    is set only for the ranks listed above; other ranks leave it unset.
    """
    self.hid_size = hid_size
    self.num_hid_layers = num_hid_layers
    self.gaussian_fixed_var = gaussian_fixed_var

    with tf.variable_scope(name):
        self.scope = tf.get_variable_scope().name

        with tf.variable_scope("obfilter"):
            ob_dims = ob.get_shape()
            rank = len(ob.shape)
            # Always hand RunningMeanStd a 1-tuple so every rank is treated the
            # same way (the rank-3/4 branches previously passed a bare scalar).
            if rank == 2:
                self.ob_rms = RunningMeanStd(shape=(ob_dims[1], ))
            elif rank == 3:
                self.ob_rms = RunningMeanStd(shape=(ob_dims[1] * ob_dims[2], ))
            elif rank == 4:
                self.ob_rms = RunningMeanStd(shape=(ob_dims[1] * ob_dims[2] * ob_dims[3], ))
        # The running statistics are created but not applied: the network
        # consumes the raw observations.
        obz = ob

        # value function
        last_out = obz
        for i in range(num_hid_layers):
            last_out = tf.nn.tanh(
                U.dense(last_out, hid_size, "vffc%i" % (i + 1),
                        weight_init=U.normc_initializer(1.0)))
        # Column 0 of the single-unit output is passed through U.sum and the
        # result is clipped to [0, 1000].
        # NOTE(review): U.sum is called without an axis, so this presumably
        # collapses the whole batch into one value -- confirm that is intended.
        self.vpred = tf.clip_by_value(
            U.sum(
                U.dense(last_out, 1, "vffinal",
                        weight_init=U.normc_initializer(1.0))[:, 0]),
            0.0, 1000.0)

    # sample actions
    self._act = U.function([ob], [self.vpred])
def _cnn(self, obs, ac_space, gaussian_fixed_var):
    """Build the value head and sub-policy action distribution on shared features.

    ``feature_net(obs)`` produces the features. The value head is a
    single-unit dense layer. The distribution parameters are either a dense
    mean concatenated with a learned, input-independent ``logstd`` variable
    (when ``gaussian_fixed_var`` is truthy and ``ac_space`` is a
    ``gym.spaces.Box``) or one dense layer emitting all parameters.

    Sets ``self.vpred``, ``self.pdtype``, ``self.pdparam`` and ``self.pd``.
    """
    shared_features = feature_net(obs)

    value_out = U.dense(shared_features, 1, "vffinal", weight_init=U.normc_initializer(1.0))
    self.vpred = value_out[:, 0]

    action_pdtype = make_pdtype(ac_space)
    self.pdtype = action_pdtype

    use_shared_logstd = gaussian_fixed_var and isinstance(ac_space, gym.spaces.Box)
    if use_shared_logstd:
        mean = U.dense(shared_features, action_pdtype.param_shape()[0] // 2, "polfinal",
                       U.normc_initializer(0.01))
        logstd = tf.get_variable(name="logstd",
                                 shape=[1, action_pdtype.param_shape()[0] // 2],
                                 initializer=tf.zeros_initializer())
        # `mean * 0.0 + logstd` expands the single logstd row to mean's shape.
        tiled_logstd = mean * 0.0 + logstd
        self.pdparam = U.concatenate([mean, tiled_logstd], axis=1)
    else:
        self.pdparam = U.dense(shared_features, action_pdtype.param_shape()[0], "polfinal",
                               U.normc_initializer(0.01))

    self.pd = action_pdtype.pdfromflat(self.pdparam)
def __init__(self, name, ob):
    """Build a small convolutional feature extractor over ``ob``.

    Observations are standardised with a running mean/std filter and clipped
    to [-5, 5], then passed through two ReLU conv layers ("l1", "l2"),
    flattened, and projected by a 64-unit ReLU dense layer ("lin").

    Args:
        name: Variable scope for all variables; the resolved scope name is
            stored in ``self.scope``.
        ob: Observation tensor.

    The resulting feature tensor is stored in ``self.ob``.
    """
    with tf.variable_scope(name):
        self.scope = tf.get_variable_scope().name

        with tf.variable_scope("obfilter"):
            # NOTE(review): the filter is sized from ob's second dimension
            # only, while the conv layers below imply a higher-rank input --
            # confirm the broadcast in the normalisation is intended.
            self.ob_rms = RunningMeanStd(shape=(ob.get_shape()[1],))
        standardized = (ob - self.ob_rms.mean) / self.ob_rms.std
        obz = tf.clip_by_value(standardized, -5.0, 5.0)

        conv1 = U.conv2d(obz, 16, "l1", [8, 8], [4, 4], pad="VALID")
        features = tf.nn.relu(conv1)
        conv2 = U.conv2d(features, 16, "l2", [4, 4], [2, 2], pad="VALID")
        features = tf.nn.relu(conv2)
        features = U.flattenallbut0(features)
        projected = U.dense(features, 64, 'lin', U.normc_initializer(1.0))
        features = tf.nn.relu(projected)

    self.ob = features
def _mlp(self, obs, hid_size, num_hid_layers, ac_space, gaussian_fixed_var):
    """Build the MLP value head and sub-policy action distribution on ``obs``.

    Two independent stacks of ``num_hid_layers`` tanh dense layers (each
    ``hid_size`` wide) are created from ``obs``: one for the value output and
    one for the policy. The distribution parameters are either a dense mean
    concatenated with a learned, input-independent ``logstd`` variable (when
    ``gaussian_fixed_var`` is truthy and ``ac_space`` is a ``gym.spaces.Box``)
    or one dense layer emitting all parameters.

    Sets ``self.vpred``, ``self.pdtype``, ``self.pdparam`` and ``self.pd``.
    """
    layer_numbers = range(1, num_hid_layers + 1)

    # Value-function tower.
    value_hidden = obs
    for layer in layer_numbers:
        value_pre = U.dense(value_hidden, hid_size, "vffc%i" % layer,
                            weight_init=U.normc_initializer(1.0))
        value_hidden = tf.nn.tanh(value_pre)
    value_out = U.dense(value_hidden, 1, "vffinal", weight_init=U.normc_initializer(1.0))
    self.vpred = value_out[:, 0]

    # Sub-policy tower; it starts again from the raw observations.
    action_pdtype = make_pdtype(ac_space)
    self.pdtype = action_pdtype
    policy_hidden = obs
    for layer in layer_numbers:
        policy_pre = U.dense(policy_hidden, hid_size, "pol%i" % layer,
                             weight_init=U.normc_initializer(1.0))
        policy_hidden = tf.nn.tanh(policy_pre)

    use_shared_logstd = gaussian_fixed_var and isinstance(ac_space, gym.spaces.Box)
    if use_shared_logstd:
        mean = U.dense(policy_hidden, action_pdtype.param_shape()[0] // 2, "polfinal",
                       U.normc_initializer(0.01))
        logstd = tf.get_variable(name="logstd",
                                 shape=[1, action_pdtype.param_shape()[0] // 2],
                                 initializer=tf.zeros_initializer())
        # `mean * 0.0 + logstd` expands the single logstd row to mean's shape.
        tiled_logstd = mean * 0.0 + logstd
        self.pdparam = U.concatenate([mean, tiled_logstd], axis=1)
    else:
        self.pdparam = U.dense(policy_hidden, action_pdtype.param_shape()[0], "polfinal",
                               U.normc_initializer(0.01))

    self.pd = action_pdtype.pdfromflat(self.pdparam)
def __init__(self, name, ob, ac_space, hid_size, num_hid_layers, gaussian_fixed_var=True):
    """Build the MLP sub-policy: a value head plus an action distribution.

    Args:
        name: Variable scope under which all variables are created; the
            resolved scope name is stored in ``self.scope``.
        ob: Observation tensor. Its second dimension sizes the running
            mean/std filter, and it is an input of ``self._act``.
        ac_space: Action space handed to ``make_pdtype``.
        hid_size: Width of every hidden dense layer.
        num_hid_layers: Number of hidden layers in each of the two towers.
        gaussian_fixed_var: When truthy and ``ac_space`` is a
            ``gym.spaces.Box``, a single learned ``logstd`` variable is used
            instead of a state-dependent one.

    Sets ``self.ob_rms``, ``self.vpred``, ``self.pdtype``, ``self.pdparam``,
    ``self.pd`` and the compiled callable ``self._act``.
    """
    self.hid_size = hid_size
    self.num_hid_layers = num_hid_layers
    self.gaussian_fixed_var = gaussian_fixed_var

    with tf.variable_scope(name):
        self.scope = tf.get_variable_scope().name

        with tf.variable_scope("obfilter"):
            self.ob_rms = RunningMeanStd(shape=(ob.get_shape()[1],))
        # Standardise with the running statistics, then clip to [-5, 5].
        standardized = (ob - self.ob_rms.mean) / self.ob_rms.std
        obz = tf.clip_by_value(standardized, -5.0, 5.0)

        layer_numbers = range(1, num_hid_layers + 1)

        # Value-function tower.
        value_hidden = obz
        for layer in layer_numbers:
            value_pre = U.dense(value_hidden, hid_size, "vffc%i" % layer,
                                weight_init=U.normc_initializer(1.0))
            value_hidden = tf.nn.tanh(value_pre)
        value_out = U.dense(value_hidden, 1, "vffinal", weight_init=U.normc_initializer(1.0))
        self.vpred = value_out[:, 0]

        # Sub-policy tower; it starts again from the filtered observations.
        action_pdtype = make_pdtype(ac_space)
        self.pdtype = action_pdtype
        policy_hidden = obz
        for layer in layer_numbers:
            policy_pre = U.dense(policy_hidden, hid_size, "pol%i" % layer,
                                 weight_init=U.normc_initializer(1.0))
            policy_hidden = tf.nn.tanh(policy_pre)

        use_shared_logstd = gaussian_fixed_var and isinstance(ac_space, gym.spaces.Box)
        if use_shared_logstd:
            mean = U.dense(policy_hidden, action_pdtype.param_shape()[0] // 2, "polfinal",
                           U.normc_initializer(0.01))
            logstd = tf.get_variable(name="logstd",
                                     shape=[1, action_pdtype.param_shape()[0] // 2],
                                     initializer=tf.zeros_initializer())
            # `mean * 0.0 + logstd` expands the single logstd row to mean's shape.
            tiled_logstd = mean * 0.0 + logstd
            self.pdparam = U.concatenate([mean, tiled_logstd], axis=1)
        else:
            self.pdparam = U.dense(policy_hidden, action_pdtype.param_shape()[0], "polfinal",
                                   U.normc_initializer(0.01))

        self.pd = action_pdtype.pdfromflat(self.pdparam)

    # Action op: `stochastic` picks between a sample and the mode of the distribution.
    stochastic = tf.placeholder(dtype=tf.bool, shape=())
    sampled = self.pd.sample()
    greedy = self.pd.mode()
    ac = U.switch(stochastic, sampled, greedy)
    self._act = U.function([stochastic, ob], [ac, self.vpred])
def __init__(self, name, ob, ac_space, hid_size, num_hid_layers, num_subpolicies,
             gaussian_fixed_var=True):
    """Build the convolutional master policy: value head plus sub-policy selector.

    Observations are scaled by 1/255 and passed through three ReLU conv
    layers ("l1", "l2", "l3"), flattened, and projected by a 512-unit ReLU
    dense layer ("lin"). The value head and the selector head both read that
    shared 512-unit representation.

    Args:
        name: Variable scope under which all variables are created; the
            resolved scope name is stored in ``self.scope``.
        ob: Observation tensor; also an input of the compiled functions.
        ac_space: Accepted for interface compatibility; unused here.
        hid_size: Stored on the instance; not used by the conv network.
        num_hid_layers: Stored on the instance; not used by the conv network.
        num_subpolicies: Number of outputs of the selector head.
        gaussian_fixed_var: Stored on the instance; not used in this method.

    Sets ``self.ob_rms``, ``self.vpred``, ``self.selector``, ``self.pdtype``,
    ``self.pd`` and the compiled callables ``self._act``, ``self._debug`` and
    ``self._act_forced``.
    """
    self.hid_size = hid_size
    self.num_hid_layers = num_hid_layers
    self.num_subpolicies = num_subpolicies
    self.gaussian_fixed_var = gaussian_fixed_var

    with tf.variable_scope(name):
        self.scope = tf.get_variable_scope().name

        with tf.variable_scope("obfilter"):
            # Still created so the attribute and its variables exist, but the
            # running statistics are not applied to the input below.
            self.ob_rms = RunningMeanStd(shape=(ob.get_shape()[1], ))
        obz = ob / 255.0

        # Shared convolutional trunk.
        last_out = obz
        last_out = tf.nn.relu(
            U.conv2d(last_out, 32, "l1", [8, 8], [4, 4], pad="VALID"))
        last_out = tf.nn.relu(
            U.conv2d(last_out, 64, "l2", [4, 4], [2, 2], pad="VALID"))
        last_out = tf.nn.relu(
            U.conv2d(last_out, 32, "l3", [3, 3], [1, 1], pad="VALID"))
        last_out = U.flattenallbut0(last_out)
        last_out = tf.nn.relu(
            tf.layers.dense(last_out, 512, name='lin',
                            kernel_initializer=U.normc_initializer(1.0)))

        # value function
        self.vpred = U.dense(last_out, 1, "vffinal",
                             weight_init=U.normc_initializer(1.0))[:, 0]

        # master policy (reads the same trunk output as the value head)
        self.selector = U.dense(last_out, num_subpolicies, "masterpol_final",
                                U.normc_initializer(0.01))
        self.pdtype = pdtype = CategoricalPdType(num_subpolicies)
        self.pd = pdtype.pdfromflat(self.selector)

    # sample actions: `stochastic` picks between a sample and the mode of the distribution
    stochastic = tf.placeholder(dtype=tf.bool, shape=())
    ac = U.switch(stochastic, self.pd.sample(), self.pd.mode())
    self._act = U.function([stochastic, ob], [ac, self.vpred])

    # debug: same inputs, but also returns the raw selector output
    self._debug = U.function([stochastic, ob], [ac, self.selector])
    # `self.selector` is listed as an input here, presumably so callers can
    # feed the selector values directly instead of computing them from `ob`.
    self._act_forced = U.function([stochastic, ob, self.selector], [ac, self.vpred])