def _init(self, ob_space, ac_space):
    self.pdtype = distributions.make_pdtype(ac_space)
    ob = U.get_placeholder(name='ob', dtype=tf.int32, shape=[None] + list(ob_space.shape))
    next_blocks, my_grid, opp_grid = tf.split(ob, [16, 12 * 6, 12 * 6], axis=1)
    with tf.variable_scope('next_blocks'):
        next_blocks = tf.one_hot(next_blocks, depth=5)
        next_blocks = U.flattenallbut0(next_blocks)
        next_blocks = tf.nn.leaky_relu(tf.layers.dense(next_blocks, 12, name='l1', kernel_initializer=U.normc_initializer(1.0)), alpha=0.1)
        next_blocks = tf.nn.leaky_relu(tf.layers.dense(next_blocks, 12, name='l2', kernel_initializer=U.normc_initializer(1.0)), alpha=0.1)
    with tf.variable_scope('grids', reuse=False):
        my_grid = _grid_cnn(my_grid)
    with tf.variable_scope('grids', reuse=True):
        opp_grid = _grid_cnn(opp_grid)
    x = tf.concat([next_blocks, my_grid, opp_grid], axis=1)
    x = tf.nn.leaky_relu(tf.layers.dense(x, 64, name='lin', kernel_initializer=U.normc_initializer(1.0)), alpha=0.1)
    logits = tf.layers.dense(x, self.pdtype.param_shape()[0], name='logits', kernel_initializer=U.normc_initializer(0.01))
    self.pd = self.pdtype.pdfromflat(logits)
    self.vpred = tf.layers.dense(x, 1, name='value', kernel_initializer=U.normc_initializer(1.0))[:, 0]
    self.state_in = []
    self.state_out = []
    stochastic = tf.placeholder(dtype=tf.bool, shape=())
    ac = U.switch(stochastic, self.pd.sample(), self.pd.mode())
    self._act = U.function([stochastic, ob], [ac, self.vpred])
def _init(self, ob_space, ac_space, hid_size, num_hid_layers, gaussian_fixed_var=True):
    assert isinstance(ob_space, gym.spaces.Box)
    self.pdtype = pdtype = make_pdtype(ac_space)
    sequence_length = None

    ob = U.get_placeholder(name="ob", dtype=tf.float32, shape=[sequence_length] + list(ob_space.shape))

    with tf.variable_scope("obfilter"):
        self.ob_rms = RunningMeanStd(shape=ob_space.shape)
    obz = tf.clip_by_value((ob - self.ob_rms.mean) / self.ob_rms.std, -5.0, 5.0)

    last_out = obz
    for i in range(num_hid_layers):
        last_out = tf.nn.tanh(U.dense(last_out, hid_size, "vffc%i" % (i + 1), weight_init=U.normc_initializer(1.0)))
    self.vpred = U.dense(last_out, 1, "vffinal", weight_init=U.normc_initializer(1.0))[:, 0]

    last_out = obz
    for i in range(num_hid_layers):
        last_out = tf.nn.tanh(U.dense(last_out, hid_size, "polfc%i" % (i + 1), weight_init=U.normc_initializer(1.0)))
    if gaussian_fixed_var and isinstance(ac_space, gym.spaces.Box):
        mean = U.dense(last_out, pdtype.param_shape()[0] // 2, "polfinal", U.normc_initializer(0.01))
        logstd = tf.get_variable(name="logstd", shape=[1, pdtype.param_shape()[0] // 2], initializer=tf.zeros_initializer())
        pdparam = U.concatenate([mean, mean * 0.0 + logstd], axis=1)
    else:
        pdparam = U.dense(last_out, pdtype.param_shape()[0], "polfinal", U.normc_initializer(0.01))

    self.pd = pdtype.pdfromflat(pdparam)
    self.state_in = []
    self.state_out = []
    stochastic = tf.placeholder(dtype=tf.bool, shape=())
    ac = U.switch(stochastic, self.pd.sample(), self.pd.mode())
    self._act = U.function([stochastic, ob], [ac, self.vpred])
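# The "mean * 0.0 + logstd" idiom above broadcasts a single state-independent
# [1, ac_dim] logstd row across the batch dimension of mean, yielding the
# [batch, 2 * ac_dim] flat parameter tensor that pdfromflat expects. A minimal
# numpy-only sketch of that shape arithmetic (all names here are illustrative,
# not from the source):
import numpy as np

batch, ac_dim = 4, 3
mean = np.random.randn(batch, ac_dim).astype(np.float32)  # per-state action means
logstd = np.zeros((1, ac_dim), dtype=np.float32)          # shared log-stddev row
pdparam = np.concatenate([mean, mean * 0.0 + logstd], axis=1)
assert pdparam.shape == (batch, 2 * ac_dim)  # first half: means, second half: logstds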
def _init(self, ob_space, ac_space, hid_size, num_hid_layers, rnn_hid_units, gaussian_fixed_var=True):
    # assert isinstance(ob_space, gym.spaces.Box)
    self.pdtype = pdtype = make_pdtype(ac_space)
    sequence_length = None

    ob = U.get_placeholder(name="ob", dtype=tf.float32, shape=[sequence_length] + list(ob_space.shape))

    # apply an RNN to reduce the observation history
    with tf.variable_scope("vf"):
        last_out = self.rnn(ob, ob_space.shape[0], rnn_hid_units)
        for i in range(num_hid_layers):
            last_out = U.dense(last_out, hid_size, "vf_dense%i" % i, weight_init=U.normc_initializer(1.0))
        self.vpred = U.dense(last_out, 1, "vffinal", weight_init=U.normc_initializer(1.0))[:, 0]

    # apply an RNN to reduce the observation history
    with tf.variable_scope("pf"):
        last_out = self.rnn(ob, ob_space.shape[0], rnn_hid_units)
        for i in range(num_hid_layers):
            last_out = U.dense(last_out, hid_size, "pf_dense%i" % i, weight_init=U.normc_initializer(1.0))
        assert gaussian_fixed_var and isinstance(ac_space, gym.spaces.Box)
        mean = U.dense(last_out, pdtype.param_shape()[0] // 2, "polfinal", U.normc_initializer(0.01))
        logstd = tf.get_variable(name="logstd", shape=[1, pdtype.param_shape()[0] // 2], initializer=tf.zeros_initializer())
        pdparam = U.concatenate([mean, mean * 0.0 + logstd], axis=1)

    self.pd = pdtype.pdfromflat(pdparam)
    self.state_in = []
    self.state_out = []
    stochastic = tf.placeholder(dtype=tf.bool, shape=())
    ac = U.switch(stochastic, self.pd.sample(), self.pd.mode())
    self._act = U.function([stochastic, ob], [ac, self.vpred])
def _create_logit_value(self, action_layer, value_layer, gaussian_fixed_var=False):
    # actor
    if gaussian_fixed_var and isinstance(self.ac_space, gym.spaces.Box):
        mean = U.dense(action_layer, self.pdtype.param_shape()[0] // 2, "polfinal", U.normc_initializer(0.01))
        logstd = tf.get_variable(name="logstd", shape=[1, self.pdtype.param_shape()[0] // 2], initializer=tf.zeros_initializer())
        pdparam = U.concatenate([mean, mean * 0.0 + logstd], axis=1)
    else:
        pdparam = U.dense(action_layer, self.pdtype.param_shape()[0], "polfinal", U.normc_initializer(0.01))
    self.pd = self.pdtype.pdfromflat(pdparam)
    self.ac = U.switch(self.stochastic, self.pd.sample(), self.pd.mode())

    # critic
    self.vpred = U.dense(value_layer, 1, "vffinal", weight_init=U.normc_initializer(1.0))[:, 0]
def _policy_nn(self, odim, adim, train):
    # activ = tf.nn.tanh
    # self.pdtype = make_pdtype(self.ac_space)
    # self.pdtype = DiagGaussianPdType(self.ac_space.shape[0])
    # hid1_size = 64
    # out = tf.layers.dense(self.x, adim, trainable=train,
    #                       kernel_initializer=tf.random_normal_initializer(stddev=np.sqrt(1 / adim)), name='out')
    # self.pd = self.pdtype.pdfromflat(out)
    # self._act = U.function([self.ob], self.pd.sample())  # [self.pd.sample(), mean, logstd]
    # self.ac = self.pd.sample()
    # logits = tf.layers.dense(self.x, self.pdtype.param_shape()[0], name='logits',
    #                          kernel_initializer=U.normc_initializer(0.01))
    # self.pd = self.pdtype.pdfromflat(logits)
    mean = tf.layers.dense(self.x, self.pdtype.param_shape()[0] // 2, name="polfinal",
                           kernel_initializer=U.normc_initializer(0.01))
    logstd = tf.get_variable(name="logstd", shape=[1, self.pdtype.param_shape()[0] // 2],
                             initializer=tf.zeros_initializer())
    # concatenate the mean and the (state-independent) logstd into the flat pd parameters
    pdparam = tf.concat([mean, mean * 0.0 + logstd], axis=1)
    self.pd = self.pdtype.pdfromflat(pdparam)
    stochastic = U.get_placeholder(dtype=tf.bool, shape=(), name="stochastic")
    ac = U.switch(stochastic, self.pd.sample(), self.pd.mode())
    self._act = U.function([stochastic, self.ob], ac)
    self.ac = ac
def _init(self, ob_space, ac_space, hid_size, num_hid_layers, exploration_rate, gaussian_fixed_var=True):
    assert isinstance(ob_space, gym.spaces.Box)
    self.pdtype = pdtype = make_pdtype(ac_space)
    sequence_length = None

    ob = U.get_placeholder(name="ob", dtype=tf.float32, shape=[sequence_length] + list(ob_space.shape))
    # with tf.variable_scope("obfilter"):
    #     self.ob_rms = RunningMeanStd(shape=ob_space.shape)
    # obz = tf.clip_by_value((ob - self.ob_rms.mean) / self.ob_rms.std, -5.0, 5.0)
    obz = ob

    valueFunction = Sequential()
    valueFunction.add(InputLayer(input_tensor=obz))
    valueFunction.add(Dense(64, activation='tanh'))
    valueFunction.add(Dense(64, activation='tanh'))
    self.vpred = self.dense(x=valueFunction.output, size=1, name="vffinal",
                            weight_init=U.normc_initializer(1.0), bias=True)[:, 0]

    model = Sequential()
    model.add(InputLayer(input_tensor=obz))
    model.add(Dense(64, activation='tanh'))
    model.add(Dense(64, activation='tanh'))
    model.add(Dense(23))
    model.load_weights("neural_kick")

    if gaussian_fixed_var and isinstance(ac_space, gym.spaces.Box):
        mean = model.output
        # logstd is initialized from exploration_rate rather than zero
        logstd = tf.get_variable(name="logstd", shape=[1, pdtype.param_shape()[0] // 2],
                                 initializer=tf.constant_initializer(exploration_rate))
        pdparam = tf.concat([mean, mean * 0.0 + logstd], axis=1)
    else:
        # tf.layers.dense takes name and kernel_initializer as keyword arguments,
        # not positionally (the third positional argument is the activation)
        pdparam = tf.layers.dense(model.output, pdtype.param_shape()[0], name="polfinal",
                                  kernel_initializer=U.normc_initializer(0.01))

    # note: this assumes the gaussian_fixed_var branch ran, since `mean` is only defined there
    my_var = tf.strided_slice(mean, [0], [1], [1], shrink_axis_mask=1)
    my_var_out = tf.identity(my_var, name='output_node')

    self.pd = pdtype.pdfromflat(pdparam)
    self.state_in = []
    self.state_out = []
    stochastic = tf.placeholder(dtype=tf.bool, shape=())
    ac = U.switch(stochastic, self.pd.sample(), self.pd.mode())
    self._act = U.function([stochastic, ob], [ac, self.vpred])
def _init(self, ob_space, ac_space, hid_size, num_hid_layers, kind):
    print(type(ob_space))
    assert isinstance(ob_space, gym.spaces.box.Box)
    self.pdtype = pdtype = make_pdtype(ac_space)
    sequence_length = None

    ob = U.get_placeholder(name="ob", dtype=tf.float32, shape=[sequence_length] + list(ob_space.shape))
    self.ob = [ob]

    # process ob
    x = ob / 255.0
    ob_last = self.img_encoder(x, kind)

    with tf.variable_scope("vf"):
        last_out = ob_last
        for i in range(num_hid_layers):
            last_out = tf.nn.relu(tf.layers.dense(last_out, hid_size, name="fc%i" % (i + 1), kernel_initializer=U.normc_initializer(1.0)))
        self.vpred = tf.layers.dense(last_out, 1, name='final', kernel_initializer=U.normc_initializer(1.0))[:, 0]

    with tf.variable_scope("pol"):
        last_out = ob_last
        for i in range(num_hid_layers):
            last_out = tf.nn.tanh(tf.layers.dense(last_out, hid_size, name='fc%i' % (i + 1), kernel_initializer=U.normc_initializer(1.0)))
        logits = tf.layers.dense(last_out, pdtype.param_shape()[0], name='logits', kernel_initializer=U.normc_initializer(0.01))
        self.pd = pdtype.pdfromflat(logits)

    self.state_in = []
    self.state_out = []
    stochastic = tf.placeholder(dtype=tf.bool, shape=())
    ac = U.switch(stochastic, self.pd.sample(), self.pd.mode())  # XXX
    self._act = U.function([stochastic, ob], [ac, self.vpred])
def _init(self, ob_space, ac_space, hid_size, num_hid_layers, tau, gaussian_fixed_var=True):
    assert isinstance(ob_space, gym.spaces.Box)
    print('use zpmpl_Adv')
    self.ac_space = ac_space
    self.hid_size = hid_size
    self.num_hid_layers = num_hid_layers
    self.gaussian_fixed_var = gaussian_fixed_var
    self.pdtype = pdtype = make_pdtype(ac_space)
    sequence_length = None

    self.ob = U.get_placeholder(name="ob_adv", dtype=tf.float32, shape=[sequence_length] + list(ob_space.shape))
    self.ob_ = U.get_placeholder(name="adv_ob_", dtype=tf.float32, shape=[sequence_length] + list(ob_space.shape))

    with tf.variable_scope("obfilter_adv"):
        self.ob_rms = RunningMeanStd(shape=ob_space.shape)

    with tf.variable_scope('adv_vf'):
        self.obz = tf.clip_by_value((self.ob - self.ob_rms.mean) / self.ob_rms.std, -5.0, 5.0)
        last_out = self.obz
        for i in range(self.num_hid_layers):
            last_out = tf.nn.tanh(tf.layers.dense(last_out, self.hid_size, name="adv_vffc%i" % (i + 1), kernel_initializer=U.normc_initializer(1.0)))
        self.vpred = tf.layers.dense(last_out, 1, name="adv_vffinal", kernel_initializer=U.normc_initializer(1.0))[:, 0]

    self.pdparam = self.build_action(self.ob)
    self.pdparam_ = self.build_action(self.ob_, reuse=True)
    self.pd = pdtype.pdfromflat(self.pdparam)
    self.state_in = []
    self.state_out = []

    stochastic = tf.placeholder(dtype=tf.bool, shape=())
    ac = U.switch(stochastic, self.pd.sample(), self.pd.mode())
    self.ac = self.pd.sample()
    self.ac_, _ = self.sample_()
    self._act = U.function([stochastic, self.ob], [ac, self.vpred])
def _init(self, ob_space, ac_space, hid_size, num_hid_layers, gaussian_fixed_var=True):
    assert isinstance(ob_space, gym.spaces.Box)
    self.pdtype = pdtype = make_pdtype(ac_space)
    sequence_length = None

    # to store the current input observation
    ob = U.get_placeholder(name="ob", dtype=tf.float32, shape=[sequence_length] + list(ob_space.shape))

    with tf.variable_scope("obfilter"):
        self.ob_rms = RunningMeanStd(shape=ob_space.shape)

    # construct the value function estimator
    with tf.variable_scope('vf'):
        # to store the clipped, normalized current input observation
        # obz = tf.clip_by_value((ob - self.ob_rms.mean) / self.ob_rms.std, -5.0, 5.0)
        obz = ob
        # last layer is the input obz
        last_out = obz
        for i in range(num_hid_layers):
            # last_out = tf.nn.tanh(tf.layers.dense(last_out, hid_size, name="fc%i" % (i + 1), kernel_initializer=tf.zeros_initializer()))
            last_out = tf.nn.tanh(tf.layers.dense(last_out, hid_size, name="fc%i" % (i + 1), kernel_initializer=U.normc_initializer(1.0)))
        # close off the neural network with a scalar value head
        # self.vpred = tf.layers.dense(last_out, 1, name='final', kernel_initializer=tf.zeros_initializer())[:, 0]
        self.vpred = tf.layers.dense(last_out, 1, name='final', kernel_initializer=U.normc_initializer(1.0))[:, 0]

    # construct the policy network
    with tf.variable_scope('pol'):
        last_out = obz
        for i in range(num_hid_layers):
            # last_out = tf.nn.tanh(tf.layers.dense(last_out, hid_size, name='fc%i' % (i + 1), kernel_initializer=tf.zeros_initializer()))
            last_out = tf.nn.tanh(tf.layers.dense(last_out, hid_size, name='fc%i' % (i + 1), kernel_initializer=U.normc_initializer(1.0)))
        # continuous action space: state-independent variance on the output Gaussian means
        if gaussian_fixed_var and isinstance(ac_space, gym.spaces.Box):
            # mean = tf.layers.dense(last_out, pdtype.param_shape()[0] // 2, name='final', kernel_initializer=tf.zeros_initializer())
            mean = tf.layers.dense(last_out, pdtype.param_shape()[0] // 2, name='final', kernel_initializer=U.normc_initializer(0.01))
            self.mean = mean
            logstd = tf.get_variable(name="logstd", shape=[1, pdtype.param_shape()[0] // 2], initializer=tf.zeros_initializer())
            pdparam = tf.concat([mean, mean * 0.0 + logstd], axis=1)
        else:
            # pdparam = tf.layers.dense(last_out, pdtype.param_shape()[0], name='final', kernel_initializer=tf.zeros_initializer())
            pdparam = tf.layers.dense(last_out, pdtype.param_shape()[0], name='final', kernel_initializer=U.normc_initializer(0.01))

    self.pd = pdtype.pdfromflat(pdparam)

    # apparently unnecessary
    self.state_in = []
    self.state_out = []

    stochastic = tf.placeholder(dtype=tf.bool, shape=())
    ac = U.switch(stochastic, self.pd.sample(), self.pd.mode())
    self._act = U.function([stochastic, ob], [ac, self.vpred])
def _init(self, ob_space, ac_space, hid_size, num_hid_layers, activation='tanh', gaussian_fixed_var=True, keep=1.0):
    assert isinstance(ob_space, gym.spaces.Box)
    self.pdtype = pdtype = make_pdtype(ac_space)
    sequence_length = None
    ob_shape = OBSERVATION_DIM if PREPROCESS else ob_space.shape[0]
    ob = U.get_placeholder(name="ob", dtype=tf.float32, shape=[sequence_length, ob_shape])

    if activation == 'tanh':
        activ = tf.nn.tanh
    elif activation == 'elu':
        activ = tf.nn.elu
    elif activation == 'lrelu':
        activ = lambda x: tf.maximum(x, 0.01 * x)
    else:
        raise NotImplementedError("Not available activation: " + activation)

    if PREPROCESS:
        last_out = ob
    else:
        with tf.variable_scope("obfilter"):
            self.ob_rms = RunningMeanStd(shape=ob_space.shape)
        obz = tf.clip_by_value((ob - self.ob_rms.mean) / self.ob_rms.std, -5.0, 5.0)
        last_out = obz

    for i in range(num_hid_layers):
        last_out = activ(U.dense(last_out, hid_size, "vffc%i" % (i + 1), weight_init=U.normc_initializer(1.0)))
        last_out = tf.nn.dropout(last_out, keep_prob=keep, name="vdrop%i" % (i + 1))
    self.vpred = U.dense(last_out, 1, "vffinal", weight_init=U.normc_initializer(1.0))[:, 0]

    last_out = ob
    for i in range(num_hid_layers):
        last_out = activ(U.dense(last_out, hid_size, "polfc%i" % (i + 1), weight_init=U.normc_initializer(1.0)))
        last_out = tf.nn.dropout(last_out, keep_prob=keep, name="pdrop%i" % (i + 1))
    if gaussian_fixed_var and isinstance(ac_space, gym.spaces.Box):
        mean = U.dense(last_out, pdtype.param_shape()[0] // 2, "polfinal", U.normc_initializer(0.01))
        logstd = tf.get_variable(name="logstd", shape=[1, pdtype.param_shape()[0] // 2], initializer=tf.zeros_initializer())
        pdparam = U.concatenate([mean, mean * 0.0 + logstd], axis=1)
    else:
        pdparam = U.dense(last_out, pdtype.param_shape()[0], "polfinal", U.normc_initializer(0.01))

    self.pd = pdtype.pdfromflat(pdparam)
    self.state_in = []
    self.state_out = []
    stochastic = tf.placeholder(dtype=tf.bool, shape=())
    ac = U.switch(stochastic, self.pd.sample(), self.pd.mode())
    self._act = U.function([stochastic, ob], [ac, self.vpred])
def __init__(self, name, observation_shape, action_shape, hid_size, num_hid_layers, stochastic=True):
    with tf.variable_scope(name):
        self.stochastic = stochastic
        self.hid_size, self.num_hid_layers = hid_size, num_hid_layers
        self.action_shape, self.observation_shape = action_shape, observation_shape
        self.scope = tf.get_variable_scope().name
        self.pdtype = DiagGaussianPdType(action_shape[0])

        observations_ph = U.get_placeholder(name='ob', dtype=tf.float32, shape=[None] + list(observation_shape))
        stochastic_ph = tf.placeholder(dtype=tf.bool, shape=())

        with tf.variable_scope('obfilter'):
            self.ob_rms = RunningMeanStd(shape=observation_shape)

        with tf.variable_scope('pol'):
            last_out = tf.clip_by_value((observations_ph - self.ob_rms.mean) / self.ob_rms.std, -5.0, 5.0)
            for i in range(num_hid_layers):
                last_out = tf.nn.tanh(tf.layers.dense(last_out, hid_size, name='fc%i' % (i + 1), kernel_initializer=U.normc_initializer(1.0)))
            mean = tf.layers.dense(last_out, self.pdtype.param_shape()[0] // 2, name='final', kernel_initializer=U.normc_initializer(0.01))
            logstd = tf.get_variable(name='logstd', shape=[1, self.pdtype.param_shape()[0] // 2], initializer=tf.zeros_initializer())
            pdparam = tf.concat([mean, mean * 0.0 + logstd], axis=1)

        self.pd = self.pdtype.pdfromflat(pdparam)
        action_op = U.switch(stochastic_ph, self.pd.sample(), self.pd.mode())
        self._act = U.function([stochastic_ph, observations_ph], action_op)
def _init(self, ob_space, ac_space, hid_size, num_hid_layers, gaussian_fixed_var=True):
    # pdb.set_trace()
    # var_is_good = any(isinstance(ob_space, t) for t in [gym.spaces.Box, Box])
    assert isinstance(ob_space, (gym.spaces.Box))
    if isinstance(hid_size, int):
        hid_size = [hid_size] * num_hid_layers
    self.pdtype = pdtype = make_pdtype(ac_space)
    sequence_length = None

    ob = U.get_placeholder(name="ob", dtype=tf.float32, shape=[sequence_length] + list(ob_space.shape))

    with tf.variable_scope("obfilter"):
        self.ob_rms = RunningMeanStd(shape=ob_space.shape)

    with tf.variable_scope('vf'):
        obz = tf.clip_by_value((ob - self.ob_rms.mean) / self.ob_rms.std, -5.0, 5.0)
        last_out = obz
        for i in range(num_hid_layers):
            last_out = tf.nn.tanh(tf.layers.dense(last_out, hid_size[i], name="fc%i" % (i + 1), kernel_initializer=U.normc_initializer(1.0)))
        self.vpred = tf.layers.dense(last_out, 1, name='final', kernel_initializer=U.normc_initializer(1.0))[:, 0]

    with tf.variable_scope('sigma'):
        obz = tf.clip_by_value((ob - self.ob_rms.mean) / self.ob_rms.std, -5.0, 5.0)
        last_out = obz
        for i in range(num_hid_layers):
            last_out = tf.nn.tanh(tf.layers.dense(last_out, hid_size[i], name="fc%i" % (i + 1), kernel_initializer=U.normc_initializer(1.0)))
        self.sigmapred = tf.layers.dense(last_out, 1, name='final', kernel_initializer=U.normc_initializer(1.0))[:, 0]

    with tf.variable_scope('pol'):
        last_out = obz
        for i in range(num_hid_layers):
            last_out = tf.nn.tanh(tf.layers.dense(last_out, hid_size[i], name='fc%i' % (i + 1), kernel_initializer=U.normc_initializer(1.0)))
        if gaussian_fixed_var and isinstance(ac_space, gym.spaces.Box):
            mean = tf.layers.dense(last_out, pdtype.param_shape()[0] // 2, name='final', kernel_initializer=U.normc_initializer(0.01))
            logstd = tf.get_variable(name="logstd", shape=[1, pdtype.param_shape()[0] // 2], initializer=tf.zeros_initializer())
            pdparam = tf.concat([mean, mean * 0.0 + logstd], axis=1)
        else:
            pdparam = tf.layers.dense(last_out, pdtype.param_shape()[0], name='final', kernel_initializer=U.normc_initializer(0.01))

    self.pd = pdtype.pdfromflat(pdparam)
    self.state_in = []
    self.state_out = []
    stochastic = tf.placeholder(dtype=tf.bool, shape=())
    ac = U.switch(stochastic, self.pd.sample(), self.pd.mode())
    self._act = U.function([stochastic, ob], [ac, self.vpred, self.sigmapred])
def _init(self, ob_space, ac_space, hid_size, num_hid_layers, gaussian_fixed_var=True):
    self.pdtype = pdtype = make_pdtype(ac_space)
    sequence_length = None

    ob = U.get_placeholder(name="ob", dtype=tf.float32, shape=[sequence_length] + list(ob_space.shape))

    with tf.variable_scope("obfilter"):
        self.ob_rms = RunningMeanStd(shape=ob_space.shape)

    with tf.variable_scope('vf'):
        obz = tf.clip_by_value((ob - self.ob_rms.mean) / self.ob_rms.std, -5.0, 5.0)
        last_out = obz
        last_out = tf.one_hot(indices=tf.cast(last_out, dtype=tf.int32), depth=ob_space.n)
        for i in range(num_hid_layers):
            last_out = tf.nn.tanh(tf.layers.dense(last_out, hid_size, name="fc%i" % (i + 1), kernel_initializer=U.normc_initializer(1.0)))
        self.vpred = tf.layers.dense(last_out, 1, name='final', kernel_initializer=U.normc_initializer(1.0))[:, 0]

    with tf.variable_scope('pol'):
        last_out = obz
        last_out = tf.one_hot(indices=tf.cast(last_out, dtype=tf.int32), depth=ob_space.n)

        def sub_pol(input_m, scope):
            state_embedding = tf.tile(tf.expand_dims(input_m, axis=1), [1, 1, 1])
            rnn_cell = rnn.BasicLSTMCell(num_units=pdtype.param_shape()[0])
            last_out, states = tf.nn.dynamic_rnn(cell=rnn_cell, inputs=state_embedding, dtype=tf.float32, scope=scope)
            return tf.squeeze(last_out, axis=1)

        ppsl = []
        for i in range(4):
            ppsl.append(sub_pol(last_out, 'pol' + '/' + str(i)))
        last_out = tf.concat(ppsl, axis=1)

        if gaussian_fixed_var and isinstance(ac_space, gym.spaces.Box):
            mean = tf.layers.dense(last_out, pdtype.param_shape()[0] // 2, name='final', kernel_initializer=U.normc_initializer(0.01), activity_regularizer=tf.contrib.layers.l2_regularizer(0.01))
            logstd = tf.get_variable(name="logstd", shape=[1, pdtype.param_shape()[0] // 2], initializer=tf.zeros_initializer())
            pdparam = tf.concat([mean, mean * 0.0 + logstd], axis=1)
        else:
            pdparam = tf.layers.dense(last_out, pdtype.param_shape()[0], name='final', kernel_initializer=U.normc_initializer(0.01), activity_regularizer=tf.contrib.layers.l2_regularizer(0.01))

    self.pd = pdtype.pdfromflat(pdparam)
    self.state_in = []
    self.state_out = []
    stochastic = tf.placeholder(dtype=tf.bool, shape=())
    ac = U.switch(stochastic, self.pd.sample(), self.pd.mode())
    self._act = U.function([stochastic, ob], [ac, self.vpred])
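# The policy above one-hot encodes an integer-valued observation before the MLP.
# A numpy-only sketch of what tf.one_hot(tf.cast(ob, tf.int32), depth=n) computes
# (toy values and the helper name are illustrative, not from the source):
import numpy as np

def one_hot(ob, depth):
    # map each integer observation to a row with a single 1.0 at that index
    out = np.zeros((len(ob), depth), dtype=np.float32)
    out[np.arange(len(ob)), np.asarray(ob, dtype=np.int64)] = 1.0
    return out

print(one_hot([2.0, 0.0, 1.0], depth=4))  # rows select indices 2, 0, 1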
def _init(self, ob_space, ac_space):
    assert isinstance(ob_space, gym.spaces.Box)
    self.pdtype = pdtype = make_pdtype(ac_space)
    sequence_length = None

    ob = U.get_placeholder(name="ob", dtype=tf.float32, shape=[sequence_length] + list(ob_space.shape))

    with tf.variable_scope("obfilter"):
        self.ob_rms = RunningMeanStd(shape=ob_space.shape)

    with tf.variable_scope('vf'):
        obz = tf.clip_by_value((ob - self.ob_rms.mean) / self.ob_rms.std, -5.0, 5.0)
        last_out = obz
        # for i in range(num_hid_layers):
        #     last_out = tf.nn.tanh(tf.layers.dense(last_out, hid_size, name="fc%i" % (i + 1), kernel_initializer=U.normc_initializer(1.0)))
        # self.vpred = tf.layers.dense(last_out, 1, name='final', kernel_initializer=U.normc_initializer(1.0))[:, 0]
        self.vpred = discriminator_model([last_out], drop_rate=0.5)

    with tf.variable_scope('pol'):
        last_out = obz
        # for i in range(num_hid_layers):
        #     last_out = tf.nn.tanh(tf.layers.dense(last_out, hid_size, name='fc%i' % (i + 1), kernel_initializer=U.normc_initializer(1.0)))
        pdparam = generator_model([last_out], pdtype.param_shape()[0], drop_rate=0.5)
        # if gaussian_fixed_var and isinstance(ac_space, gym.spaces.Box):
        #     mean = tf.layers.dense(last_out, pdtype.param_shape()[0] // 2, name='final', kernel_initializer=U.normc_initializer(0.01))
        #     logstd = tf.get_variable(name="logstd", shape=[1, pdtype.param_shape()[0] // 2], initializer=tf.zeros_initializer())
        #     pdparam = tf.concat([mean, mean * 0.0 + logstd], axis=1)
        # else:
        #     pdparam = tf.layers.dense(last_out, pdtype.param_shape()[0], name='final', kernel_initializer=U.normc_initializer(0.01))

    self.pd = pdtype.pdfromflat(pdparam)
    self.state_in = []
    self.state_out = []
    stochastic = tf.placeholder(dtype=tf.bool, shape=())
    ac = U.switch(stochastic, self.pd.sample(), self.pd.mode())
    self._act = U.function([stochastic, ob], [ac, self.vpred])
def _init(self, ob_space, ac_space, hid_size, num_hid_layers, gaussian_fixed_var=True, summaries=False, should_act=True):
    assert isinstance(ob_space, gym.spaces.Box)
    self.pdtype = pdtype = make_pdtype(ac_space)
    sequence_length = None

    # get_tensor_by_name raises KeyError when the tensor does not exist (it never
    # returns None), so fall back to creating the placeholder instead
    try:
        ob = tf.get_default_graph().get_tensor_by_name("observations:0")
    except KeyError:
        ob = U.get_placeholder(name="observations", dtype=tf.float32, shape=[sequence_length] + list(ob_space.shape))

    with tf.variable_scope('pol'):
        last_out = ob
        for i in range(num_hid_layers):
            last_out = tf.layers.dense(last_out, hid_size, name='fc%i' % (i + 1), kernel_initializer=U.normc_initializer(1.0))
            last_out = tf.nn.elu(last_out)
            # last_out = tf.nn.tanh(last_out)
        if gaussian_fixed_var and isinstance(ac_space, gym.spaces.Box):
            mean = tf.layers.dense(last_out, pdtype.param_shape()[0] // 2, name='final', kernel_initializer=U.normc_initializer(0.01))
            logstd = tf.get_variable(name="logstd", shape=[1, pdtype.param_shape()[0] // 2], initializer=tf.zeros_initializer())
            # tf.ones(shape=mean.shape) fails for a dynamic batch dimension; ones_like broadcasts correctly
            pdparam = tf.concat([mean, tf.ones_like(mean) * logstd], axis=1)
        else:
            pdparam = tf.layers.dense(last_out, pdtype.param_shape()[0], name='final', kernel_initializer=U.normc_initializer(0.01))

    with tf.variable_scope("distribution"):
        self.pd = pdtype.pdfromflat(pdparam)

    if should_act:
        with tf.variable_scope("obfilter"):
            self.ob_rms = RunningMeanStd(shape=ob_space.shape)
        with tf.variable_scope('vf'):
            # obz = tf.clip_by_value((ob - self.ob_rms.mean) / self.ob_rms.std, -5.0, 5.0)
            last_out = ob
            for i in range(num_hid_layers):
                last_out = tf.layers.dense(last_out, hid_size, name="fc%i" % (i + 1), kernel_initializer=U.normc_initializer(1.0))
                last_out = tf.nn.tanh(last_out)
            self.vpred = tf.layers.dense(last_out, 1, name='final', kernel_initializer=U.normc_initializer(1.0))[:, 0]

        self.state_in = []
        self.state_out = []

        with tf.variable_scope("distribution"):
            stochastic = tf.placeholder(dtype=tf.bool, shape=(), name="stochastic")
            ac = U.switch(stochastic, self.pd.sample(), self.pd.mode())
        self._act = U.function([stochastic, ob], [ac, self.vpred])
def __init__(self, sess, ob_space, ac_space, nenv, nsteps, nstack, reuse=False):
    nbatch = nenv * nsteps
    ob_shape = (nbatch, ob_space.shape[0] * nstack)
    nact = ac_space.shape[0]
    X = tf.placeholder(tf.float32, ob_shape)  # obs
    self.pdtype = pdtype = make_pdtype(ac_space)

    with tf.variable_scope("obfilter", reuse=reuse):
        self.ob_rms = RunningMeanStd(shape=ob_shape[1:])
    with tf.variable_scope("retfilter", reuse=reuse):
        self.ret_rms = RunningMeanStd(shape=(1,))

    obz = tf.clip_by_value((X - self.ob_rms.mean) / self.ob_rms.std, -5.0, 5.0)
    # obz = X
    with tf.variable_scope("model", reuse=reuse):
        h1 = tf.nn.tanh(dense(obz, 128, "fc1", weight_init=U.normc_initializer(1.0), bias_init=0.0))
        h2 = tf.nn.tanh(dense(h1, 128, "fc2", weight_init=U.normc_initializer(1.0), bias_init=0.0))
        h3 = tf.nn.tanh(dense(h2, 128, "fc3", weight_init=U.normc_initializer(1.0), bias_init=0.0))
        mean = dense(h3, nact, "mean", weight_init=U.normc_initializer(0.1), bias_init=0.0)
        logstd = tf.get_variable("logstd", [nact], tf.float32, tf.zeros_initializer())
        logstd = tf.expand_dims(logstd, 0)
        pdparam = U.concatenate([mean, mean * 0.0 + logstd], axis=1)
        vf = dense(h3, 1, "v", weight_init=U.normc_initializer(1.0), bias_init=0.0)
        v0 = vf[:, 0]

    self.pd = pdtype.pdfromflat(pdparam)
    stochastic = tf.placeholder(dtype=tf.bool, shape=())
    a0 = U.switch(stochastic, self.pd.sample(), self.pd.mode())
    self.initial_state = []  # not stateful

    def step(stoch, ob, *_args, **_kwargs):
        a, v = sess.run([a0, v0], {stochastic: stoch, X: ob})
        return a, v, []  # dummy state

    def value(ob, *_args, **_kwargs):
        return sess.run(v0, {X: ob})

    self.X = X
    self.vf = vf
    self.vnorm = (self.vf - self.ret_rms.mean) / self.ret_rms.std
    self.step = step
    self.value = value
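# A hedged usage sketch for the step/value interface above; `policy` and `env`
# are hypothetical stand-ins for objects constructed elsewhere, and the rollout
# loop is the standard pattern rather than code from the source:
#
#   ob = env.reset()
#   a, v, _ = policy.step(True, ob[None])        # sample an action, get the value estimate
#   next_ob, rew, done, info = env.step(a[0])
#   v_boot = policy.value(next_ob[None])         # bootstrap value for computing returns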
def _init(self, ob_space, ac_space, hid_size, num_hid_layers, kind):
    assert isinstance(ob_space, tuple)
    self.pdtype = pdtype = make_pdtype(ac_space)
    sequence_length = None

    ob_p = U.get_placeholder(name="ob_physics", dtype=tf.float32, shape=[sequence_length] + list(ob_space[0].shape))
    ob_f = U.get_placeholder(name="ob_frames", dtype=tf.float32, shape=[sequence_length] + list(ob_space[1].shape))
    self.ob = [ob_p, ob_f]

    # process ob_p
    with tf.variable_scope("obfilter"):
        self.ob_rms = RunningMeanStd(shape=ob_space[0].shape)
    obpz = tf.clip_by_value((ob_p - self.ob_rms.mean) / self.ob_rms.std, -5.0, 5.0)

    # process ob_f
    x = ob_f / 255.0
    x = self.img_encoder(x, kind)

    ob_last = tf.concat((obpz, x), axis=-1)

    with tf.variable_scope("vf"):
        last_out = ob_last
        for i in range(num_hid_layers):
            last_out = tf.nn.relu(tf.layers.dense(last_out, hid_size, name="fc%i" % (i + 1), kernel_initializer=U.normc_initializer(1.0)))
        self.vpred_ext = tf.layers.dense(last_out, 1, name='vf_ext', kernel_initializer=U.normc_initializer(1.0))[:, 0]
        self.vpred_int = tf.layers.dense(last_out, 1, name='vf_int', kernel_initializer=U.normc_initializer(1.0))[:, 0]

    with tf.variable_scope("pol"):
        last_out = ob_last
        for i in range(num_hid_layers):
            last_out = tf.nn.tanh(tf.layers.dense(last_out, hid_size, name='fc%i' % (i + 1), kernel_initializer=U.normc_initializer(1.0)))
        logits = tf.layers.dense(last_out, pdtype.param_shape()[0], name='logits', kernel_initializer=U.normc_initializer(0.01))
        self.pd = pdtype.pdfromflat(logits)

    self.state_in = []
    self.state_out = []
    stochastic = tf.placeholder(dtype=tf.bool, shape=())
    ac = U.switch(stochastic, self.pd.sample(), self.pd.mode())
    self._act = U.function([stochastic, ob_p, ob_f], [ac, self.vpred_ext, self.vpred_int])
def _init(self, ob_space, ac_space, layers_val, layers_pol, gaussian_fixed_var=True, dist='gaussian'):
    assert isinstance(ob_space, gym.spaces.Box)
    self.dist = dist
    self.pdtype = pdtype = make_pdtype(ac_space, dist=dist)
    sequence_length = None

    ob = U.get_placeholder(name="ob", dtype=tf.float32, shape=[sequence_length] + list(ob_space.shape))

    with tf.variable_scope("obfilter"):
        self.ob_rms = RunningMeanStd(shape=ob_space.shape)

    with tf.variable_scope('vf'):
        obz = tf.clip_by_value((ob - self.ob_rms.mean) / self.ob_rms.std, -5.0, 5.0)
        last_out = obz
        for i, size in enumerate(layers_val):
            last_out = tf.nn.relu(tf.layers.dense(last_out, size, name="fc%i" % (i + 1), kernel_initializer=U.normc_initializer(1.0)))
        self.vpred = tf.layers.dense(last_out, 1, name='final', kernel_initializer=U.normc_initializer(1.0))[:, 0]

    with tf.variable_scope('pol'):
        last_out = obz
        for i, size in enumerate(layers_pol):
            last_out = tf.nn.tanh(tf.layers.dense(last_out, size, name='fc%i' % (i + 1), kernel_initializer=U.normc_initializer(1.0)))
        if gaussian_fixed_var and isinstance(ac_space, gym.spaces.Box):
            mean = tf.layers.dense(last_out, pdtype.param_shape()[0] // 2, name='final', kernel_initializer=U.normc_initializer(0.01))
            logstd = tf.get_variable(name="logstd", shape=[1, pdtype.param_shape()[0] // 2], initializer=tf.zeros_initializer())
            pdparam = tf.concat([mean, mean * 0.0 + logstd], axis=1)
        else:
            pdparam = tf.layers.dense(last_out, pdtype.param_shape()[0], name='final', kernel_initializer=U.normc_initializer(0.01))

    self.pd = pdtype.pdfromflat(pdparam)
    self.state_in = []
    self.state_out = []
    stochastic = tf.placeholder(dtype=tf.bool, shape=())
    ac = U.switch(stochastic, self.pd.sample(), self.pd.mode())
    if dist == 'gaussian':
        self._act = U.function([stochastic, ob], [ac, self.vpred, self.pd.std, self.pd.mean, self.pd.logstd])
    elif dist == 'beta':
        self._act = U.function([stochastic, ob], [ac, self.vpred, self.pd.alpha, self.pd.beta, self.pd.alpha_beta])
def _init(self, ob_space, ac_space, hid_size, num_hid_layers, exploration_rate, gaussian_fixed_var=True):
    assert isinstance(ob_space, gym.spaces.Box)
    self.pdtype = pdtype = make_pdtype(ac_space)
    sequence_length = None

    ob = U.get_placeholder(name="ob", dtype=tf.float32, shape=[sequence_length] + list(ob_space.shape))

    with tf.variable_scope("obfilter"):
        self.ob_rms = RunningMeanStd(shape=ob_space.shape)

    with tf.variable_scope('vf'):
        obz = tf.clip_by_value((ob - self.ob_rms.mean) / self.ob_rms.std, -5.0, 5.0)
        last_out = obz
        for i in range(num_hid_layers):
            last_out = tf.nn.tanh(tf.layers.dense(last_out, hid_size, name="fc%i" % (i + 1), kernel_initializer=U.normc_initializer(1.0)))
        self.vpred = tf.layers.dense(last_out, 1, name='final', kernel_initializer=U.normc_initializer(1.0))[:, 0]

    with tf.variable_scope('pol'):
        last_out = obz
        for i in range(num_hid_layers):
            last_out = tf.nn.tanh(tf.layers.dense(last_out, hid_size, name='fc%i' % (i + 1), kernel_initializer=U.normc_initializer(0.01)))
        if gaussian_fixed_var and isinstance(ac_space, gym.spaces.Box):
            mean = tf.layers.dense(last_out, pdtype.param_shape()[0] // 2, name='final', kernel_initializer=U.normc_initializer(0.01))
            # logstd is initialized from exploration_rate rather than zero
            logstd = tf.get_variable(name="logstd", shape=[1, pdtype.param_shape()[0] // 2], initializer=tf.constant_initializer(exploration_rate))
            pdparam = tf.concat([mean, mean * 0.0 + logstd], axis=1)
        else:
            pdparam = tf.layers.dense(last_out, pdtype.param_shape()[0], name='final', kernel_initializer=U.normc_initializer(0.01))
        # note: this assumes the gaussian_fixed_var branch ran, since `mean` is only defined there
        my_var = tf.strided_slice(mean, [0], [1], [1], shrink_axis_mask=1)
        my_var_out = tf.identity(my_var, name='output_node')

    self.pd = pdtype.pdfromflat(pdparam)
    self.state_in = []
    self.state_out = []
    stochastic = tf.placeholder(dtype=tf.bool, shape=())
    ac = U.switch(stochastic, self.pd.sample(), self.pd.mode())
    self._act = U.function([stochastic, ob], [ac, self.vpred])
def _build(self):
    ac_space = self._ac_space
    num_hid_layers = self._num_hid_layers
    hid_size = self._hid_size
    gaussian_fixed_var = self._gaussian_fixed_var

    # obs
    self._obs = {}
    for ob_name, ob_shape in self._ob_shape.items():
        self._obs[ob_name] = U.get_placeholder(
            name="ob_{}_primitive".format(ob_name), dtype=tf.float32,
            shape=[None] + self._ob_shape[ob_name])

    # obs normalization
    self.ob_rms = {}
    for ob_name in self.ob_type:
        with tf.variable_scope("ob_rms_{}".format(ob_name)):
            self.ob_rms[ob_name] = RunningMeanStd(shape=self._ob_shape[ob_name])
    obz = [(self._obs[ob_name] - self.ob_rms[ob_name].mean) / self.ob_rms[ob_name].std
           for ob_name in self.ob_type]
    obz = [tf.clip_by_value(ob, -5.0, 5.0) for ob in obz]
    obz = tf.concat(obz, -1)

    # value function
    with tf.variable_scope("vf"):
        last_out = obz
        for i in range(num_hid_layers):
            last_out = self._activation(
                tf.layers.dense(last_out, hid_size, name="fc%i" % (i + 1),
                                kernel_initializer=U.normc_initializer(1.0)))
        self.vpred = tf.layers.dense(last_out, 1, name="final",
                                     kernel_initializer=U.normc_initializer(1.0))[:, 0]

    # primitive policy
    self.pdtype = pdtype = make_pdtype(ac_space)
    with tf.variable_scope("pol"):
        last_out = obz
        for i in range(num_hid_layers):
            last_out = self._activation(
                tf.layers.dense(last_out, hid_size, name="fc%i" % (i + 1),
                                kernel_initializer=U.normc_initializer(1.0)))
        if gaussian_fixed_var and isinstance(ac_space, gym.spaces.Box):
            mean = tf.layers.dense(last_out, pdtype.param_shape()[0] // 2, name="final",
                                   kernel_initializer=U.normc_initializer(0.01))
            logstd = tf.get_variable(name="logstd", shape=[1, pdtype.param_shape()[0] // 2],
                                     initializer=tf.zeros_initializer())
            pdparam = tf.concat([mean, mean * 0.0 + logstd], axis=1)
        else:
            pdparam = tf.layers.dense(last_out, pdtype.param_shape()[0], name="final",
                                      kernel_initializer=U.normc_initializer(0.01))
    self.pd = pdtype.pdfromflat(pdparam)

    # sample action
    stochastic = tf.placeholder(dtype=tf.bool, shape=())
    ac = U.switch(stochastic, self.pd.sample(), self.pd.mode())

    self.obs = [self._obs[ob_name] for ob_name in self.ob_type]
    self._act = U.function([stochastic] + self.obs, [ac, self.vpred])
    self._value = U.function(self.obs, self.vpred)
def _init(self, ob_space, ac_space, hid_size, num_hid_layers, gaussian_fixed_var=True):
    assert isinstance(ob_space, gym.spaces.Box)
    self.pdtype = pdtype = make_pdtype(ac_space)
    sequence_length = None
    feature_funcs = []

    ob = U.get_placeholder(name="ob", dtype=tf.float32, shape=[sequence_length] + list(ob_space.shape))
    self.std = tf.constant(1.0)

    with tf.variable_scope("obfilter"):
        self.ob_rms = RunningMeanStd(shape=ob_space.shape)

    with tf.variable_scope('vf'):
        obz = tf.clip_by_value((ob - self.ob_rms.mean) / self.ob_rms.std, -5.0, 5.0)
        import numpy as np
        # for i in range(0, ob_space.shape[0]):
        #     # Polynomial
        #     # feature_funcs.append(lambda s, i=i: tf.pow(s, i))
        #     # Fourier
        #     # feature_funcs.append(lambda s, i=i: tf.cos(i * np.pi * s))
        #     # RBF
        #     feature_funcs.append(lambda s, i=i: tf.exp(-tf.pow(s - self.ob_rms.mean, 2) / (2 * self.ob_rms.std ** 2)))
        # obz = tf.concat([func(ob) for func in feature_funcs], axis=1)
        last_out = obz
        for i in range(num_hid_layers):
            last_out = tf.nn.tanh(tf.layers.dense(last_out, hid_size, name="fc%i" % (i + 1), kernel_initializer=U.normc_initializer(1.0)))
        self.vpred = tf.layers.dense(last_out, 1, name='final', kernel_initializer=U.normc_initializer(0.1))[:, 0]

    with tf.variable_scope('pol'):
        last_out = obz
        # for i in range(num_hid_layers):
        #     last_out = tf.nn.tanh(tf.layers.dense(last_out, hid_size, name='fc%i' % (i + 1), kernel_initializer=U.normc_initializer(1.0)))
        if gaussian_fixed_var and isinstance(ac_space, gym.spaces.Box):
            mean = tf.layers.dense(last_out, pdtype.param_shape()[0] // 2, name='final', kernel_initializer=U.normc_initializer(0.01))
            # constant (non-trainable) logstd of 0.05 instead of a learned variable
            logstd = tf.multiply(tf.ones(shape=[1, pdtype.param_shape()[0] // 2]), tf.constant(0.05))
            pdparam = tf.concat([mean, mean * 0.0 + logstd], axis=1)
        else:
            pdparam = tf.layers.dense(last_out, pdtype.param_shape()[0], name='final', kernel_initializer=U.normc_initializer(0.01))
        pdparam = tf.clip_by_value(pdparam, -10.0, 10.0)

    self.pd = pdtype.pdfromflat(pdparam)
    self.state_in = []
    self.state_out = []
    stochastic = tf.placeholder(dtype=tf.bool, shape=())
    ac = U.switch(stochastic, self.pd.sample(), self.pd.mode())
    self._act = U.function([stochastic, ob], [ac, self.vpred])
def _init(self, ob_space, ac_space, hid_size, num_hid_layers, gaussian_fixed_var=True): assert isinstance(ob_space, gym.spaces.Box) self.pdtype = pdtype = make_pdtype(ac_space) sequence_length = None ob = U.get_placeholder(name="ob", dtype=tf.float32, shape=[sequence_length] + list(ob_space.shape)) next_ob = U.get_placeholder(name="next_ob", dtype=tf.float32, shape=[sequence_length] + list(ob_space.shape)) act = U.get_placeholder(name="act", dtype=tf.float32, shape=[sequence_length] + list(ac_space.shape)) with tf.variable_scope("obfilter"): self.ob_rms = RunningMeanStd(shape=ob_space.shape) with tf.variable_scope('qf'): obz = tf.clip_by_value( (next_ob - self.ob_rms.mean) / self.ob_rms.std, -5.0, 5.0) last_out = obz for i in range(num_hid_layers): if i == num_hid_layers - 1: last_out = tf.concat([last_out, act], axis=-1) last_out = tf.nn.tanh( tf.layers.dense( last_out, hid_size, name="fc%i" % (i + 1), kernel_initializer=U.normc_initializer(1.0))) self.qpred = tf.layers.dense( last_out, 1, name='final', kernel_initializer=U.normc_initializer(1.0))[:, 0] with tf.variable_scope('vf'): obz = tf.clip_by_value((ob - self.ob_rms.mean) / self.ob_rms.std, -5.0, 5.0) last_out = obz for i in range(num_hid_layers): last_out = tf.nn.tanh( tf.layers.dense( last_out, hid_size, name="fc%i" % (i + 1), kernel_initializer=U.normc_initializer(1.0))) self.vpred = tf.layers.dense( last_out, 1, name='final', kernel_initializer=U.normc_initializer(1.0))[:, 0] with tf.variable_scope('pol'): # out_std = tf.exp(0.5*logstd + 0.0) # pdparam = tf.concat([mean, mean * 0.0 + logstd], axis=1) last_out = obz for i in range(num_hid_layers): last_out = tf.nn.tanh( tf.layers.dense( last_out, hid_size, name='fc%i' % (i + 1), kernel_initializer=U.normc_initializer(1.0))) if gaussian_fixed_var and isinstance(ac_space, gym.spaces.Box): mean = tf.layers.dense( last_out, pdtype.param_shape()[0] // 2, name='final', kernel_initializer=U.normc_initializer(0.01)) logstd = tf.get_variable( name="logstd", shape=[1, pdtype.param_shape()[0] // 2], initializer=tf.zeros_initializer()) # pdparam = tf.concat([mean, mean * 0.0 + logstd], axis=1) import numpy as np pdparam = tf.concat([ mean, mean * 0.0 + np.random.randn(pdtype.param_shape()[0] // 2) * logstd ], axis=1) else: pdparam = tf.layers.dense( last_out, pdtype.param_shape()[0], name='final', kernel_initializer=U.normc_initializer(0.01)) self.pd = pdtype.pdfromflat(pdparam) self.state_in = [] self.state_out = [] stochastic = tf.placeholder(dtype=tf.bool, shape=()) ac = U.switch(stochastic, self.pd.sample(), self.pd.mode()) self._act = U.function([stochastic, ob], [ac, self.vpred])
def _init(self, ob_space, ac_space, hid_size, num_hid_layers, gaussian_fixed_var=True): assert isinstance(ob_space, gym.spaces.Box) self.pdtype = pdtype = make_pdtype(ac_space) sequence_length = None ob = U.get_placeholder(name="ob", dtype=tf.float32, shape=[sequence_length] + list(ob_space.shape)) with tf.variable_scope("obfilter"): self.ob_rms = RunningMeanStd(shape=ob_space.shape) with tf.variable_scope('vf'): obz = ob #tf.clip_by_value((ob - self.ob_rms.mean) / self.ob_rms.std, -5.0, 5.0) last_out = obz for i in range(num_hid_layers): last_out = tf.nn.tanh( tf.layers.dense( last_out, hid_size, name="fc%i" % (i + 1), kernel_initializer=U.normc_initializer(1.0))) self.vpred = tf.layers.dense( last_out, 1, name='final', kernel_initializer=U.normc_initializer(1.0))[:, 0] with tf.variable_scope('pol'): last_out = obz for i in range(num_hid_layers): last_out = tf.nn.tanh( tf.layers.dense( last_out, hid_size, name='fc%i' % (i + 1), kernel_initializer=U.normc_initializer(1.0))) if gaussian_fixed_var and isinstance(ac_space, gym.spaces.Box): mean = tf.layers.dense( last_out, pdtype.param_shape()[0] // 2, name='final', kernel_initializer=U.normc_initializer(0.01)) logstd = tf.get_variable( name="logstd", shape=[1, pdtype.param_shape()[0] // 2], initializer=tf.zeros_initializer()) pdparam = tf.concat([mean, mean * 0.0 + logstd], axis=1) else: pdparam = tf.layers.dense( last_out, pdtype.param_shape()[0], name='final', kernel_initializer=U.normc_initializer(0.01)) # Since we are using a Box for the action space # this distribution is used DiagGaussianPd self.pd = pdtype.pdfromflat(pdparam) self.state_in = [] self.state_out = [] stochastic = tf.placeholder(dtype=tf.bool, shape=()) # if stocastic = true, the call the sample of the distribion # otherwise just use the mean ac = U.switch(stochastic, self.pd.sample(), self.pd.mode()) self._act = U.function([stochastic, ob], [ac, self.vpred])
def _build(self):
    num_primitives = self.num_primitives
    num_hid_layers = self._num_hid_layers
    hid_size = self._hid_size

    self._obs = {}
    for ob_name, ob_shape in self._ob_shape.items():
        self._obs[ob_name] = U.get_placeholder(
            name="ob_{}".format(ob_name), dtype=tf.float32,
            shape=[None] + self._ob_shape[ob_name])
    self._prev_primitive = prev_primitive = U.get_placeholder(
        name="prev_primitive", dtype=tf.int32, shape=[None])

    with tf.variable_scope(self.name):
        self._scope = tf.get_variable_scope().name

        self.ob_rms = {}
        for ob_name in self.ob_type:
            with tf.variable_scope("ob_rms_{}".format(ob_name)):
                self.ob_rms[ob_name] = RunningMeanStd(shape=self._ob_shape[ob_name])
        obz = [(self._obs[ob_name] - self.ob_rms[ob_name].mean) / self.ob_rms[ob_name].std
               for ob_name in self.ob_type]
        obz = [tf.clip_by_value(ob, -5.0, 5.0) for ob in obz]
        obz = tf.concat(obz, -1)

        prev_primitive_one_hot = tf.one_hot(prev_primitive, num_primitives, name="prev_primitive_one_hot")
        obz = tf.concat([obz, prev_primitive_one_hot], -1)

        # value function
        with tf.variable_scope("vf"):
            _ = obz
            for i in range(num_hid_layers):
                _ = self._activation(
                    tf.layers.dense(_, hid_size, name="fc%d" % (i + 1),
                                    kernel_initializer=U.normc_initializer(1.0)))
            self.vpred = tf.layers.dense(_, 1, name="vpred",
                                         kernel_initializer=U.normc_initializer(1.0))[:, 0]

        # meta policy
        with tf.variable_scope("pol"):
            _ = obz
            for i in range(num_hid_layers):
                _ = self._activation(
                    tf.layers.dense(_, hid_size, name="fc%i" % (i + 1),
                                    kernel_initializer=U.normc_initializer(1.0)))
            self.selector = tf.layers.dense(_, num_primitives, name="action",
                                            kernel_initializer=U.normc_initializer(0.01))
            self.pdtype = pdtype = CategoricalPdType(num_primitives)
            self.pd = pdtype.pdfromflat(self.selector)

        # sample action
        stochastic = tf.placeholder(dtype=tf.bool, shape=())
        ac = U.switch(stochastic, self.pd.sample(), self.pd.mode())

        self.obs = [self._obs[ob_name] for ob_name in self.ob_type]
        self._act = U.function([stochastic, self._prev_primitive] + self.obs, [ac, self.vpred])
def __init__(self, observations, action_space, latent, optimizer=None, sess=None, train=True,
             beta=1.0, l2=0., lr=0.001, init_scale=0.01, init_bias=0.0,
             trainable_variance=True, trainable_bias=True, init_logstd=0.,
             scope_name="pi", clip=None, state_dependent_variance=True, **tensors):
    """
    Parameters:
    ----------
    observations    tensorflow placeholder in which the observations will be fed

    latent          latent state from which the policy distribution parameters should be inferred

    sess            tensorflow session to run calculations in (if None, the default session is used)

    **tensors       tensorflow tensors for additional attributes such as state or mask
    """
    self.X = observations
    self.state = tf.constant([])
    self.initial_state = None
    self.__dict__.update(tensors)

    latent = tf.layers.flatten(latent)
    self.action_space = action_space
    self.pdtype = make_pdtype(action_space)
    self.pd, self.pi = self.pdtype.pdfromlatent(latent, init_scale=init_scale, init_bias=init_bias,
                                                trainable_variance=trainable_variance,
                                                state_dependent_variance=state_dependent_variance,
                                                trainable_bias=trainable_bias,
                                                init_logstd=init_logstd, clip=clip, beta=beta)  # init_bias=0.0
    self.stochastic = tf.placeholder(dtype=tf.bool, shape=())
    self.action = tf_util.switch(self.stochastic, self.pd.sample(), self.pd.mode())
    self.neglogp = self.pd.neglogp(self.action)
    if beta == 1.0:
        self.prob = tf.nn.softmax(self.pd.flatparam())
    else:
        self.prob = boltzmann(self.pd.flatparam(), beta=beta)
    if optimizer is None:
        self.optimizer = tf.train.AdamOptimizer(learning_rate=lr)
    else:
        self.optimizer = optimizer
    self.sess = sess or tf.get_default_session()
    self.vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope=scope_name)

    try:
        # discrete actions: gradients of the mean log-probability of the selected actions
        self.action_ph = tf.placeholder(tf.int64, [None], name='targets_placeholder')
        self.action_selected = action_selected = tf.one_hot(self.action_ph, self.action_space.n)
        # out = tf.reduce_sum(tf.reduce_sum(tf.log(self.logits + 1e-5) * action_selected, axis=1))
        out = tf.reduce_mean(tf.log(tf.reduce_sum(self.prob * action_selected, axis=1)))
        gradients = tf.gradients(out, self.vars)
    except:
        # continuous actions: gradients of the distribution's log-likelihood
        self.action_ph = tf.placeholder(dtype=tf.float32, shape=(None,) + action_space.shape, name='targets_placeholder')
        gradients = tf.gradients(-self.pd.neglogp(self.action_ph), self.vars)

    self.cont = cont = not isinstance(self.action_space, Discrete)
    self.compute_gradients = tf_util.function(
        inputs=[self.X, self.action_ph],
        outputs=[gradients, tf.exp(-self.pd.neglogp(self.action_ph)), -self.pd.neglogp(self.action_ph), self.pd.mean]
    )
    '''self.compute_cont_gradients = tf_util.function(
        inputs=[self.X, self.action_ph],
        outputs=tf.gradients(-self.pd.neglogp(self.action_ph), self.vars)
    )'''
    self.debug = tf_util.function(
        inputs=[self.X, self.action_ph],
        outputs=[gradients, self.prob, self.action_ph]
    )
    self.set_from_flat = tf_util.SetFromFlat(self.vars)

    if self.cont:
        # R^2 of the predicted means serves as an "accuracy" for continuous actions
        total_error = tf.reduce_sum(tf.square(self.action_ph - tf.reduce_mean(self.action_ph, axis=0)), axis=0)
        unexplained_error = tf.reduce_sum(tf.square(self.action_ph - self.pd.mean), axis=0)
        R_squared = 1 - (unexplained_error / total_error)
        self.accuracy = accuracy = R_squared
    else:
        self.accuracy = accuracy = tf.reduce_mean(tf.cast(tf.math.equal(self.pd.mode(), self.action_ph), tf.float32))
    self.entropy = entropy = tf.reduce_mean(self.pd.entropy())
    if train:
        self.gamma = l2
        self._build_train(cont=cont, state_dependent_variance=state_dependent_variance)
    self.pdf = tf.exp(self.pd.logp(self.action_ph))
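# A numpy restatement of the R^2 "accuracy" used above for continuous actions:
# one minus the ratio of unexplained to total variance of the target actions
# (the function name and toy values are illustrative, not from the source):
import numpy as np

def r_squared(actions, predicted_mean):
    total = np.sum(np.square(actions - actions.mean(axis=0)), axis=0)
    unexplained = np.sum(np.square(actions - predicted_mean), axis=0)
    return 1.0 - unexplained / total

acts = np.array([[0.0], [1.0], [2.0]])
print(r_squared(acts, acts))                 # perfect predictions -> 1.0
print(r_squared(acts, np.zeros_like(acts)))  # poor predictions -> below 1.0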
def _init(self, ob_space, ac_space, hid_size, num_hid_layers, lstm_hid_size, kind):
    print("This is the LSTM policy for sensors only.")
    assert isinstance(ob_space, tuple)
    self.pdtype = pdtype = make_pdtype(ac_space)
    sequence_length = None

    ob_p = U.get_placeholder(name="ob_physics", dtype=tf.float32, shape=[sequence_length] + list(ob_space[0].shape))
    ob_f = U.get_placeholder(name="ob_frames", dtype=tf.float32, shape=[sequence_length] + list(ob_space[1].shape))

    # process ob_p
    with tf.variable_scope("obfilter"):
        self.ob_rms = RunningMeanStd(shape=ob_space[0].shape)
    obpz = tf.clip_by_value((ob_p - self.ob_rms.mean) / self.ob_rms.std, -5.0, 5.0)

    # process ob_f
    x = ob_f / 255.0
    if kind == 'small':  # from the A3C paper
        x = tf.nn.relu(U.conv2d(x, 16, "l1", [8, 8], [4, 4], pad="VALID"))
        x = tf.nn.relu(U.conv2d(x, 32, "l2", [4, 4], [2, 2], pad="VALID"))
        x = U.flattenallbut0(x)
        x = tf.nn.relu(tf.layers.dense(x, 256, name='lin', kernel_initializer=U.normc_initializer(1.0)))
    elif kind == 'large':  # Nature DQN
        x = tf.nn.relu(U.conv2d(x, 32, "l1", [8, 8], [4, 4], pad="VALID"))
        x = tf.nn.relu(U.conv2d(x, 64, "l2", [4, 4], [2, 2], pad="VALID"))
        x = tf.nn.relu(U.conv2d(x, 64, "l3", [3, 3], [1, 1], pad="VALID"))
        x = U.flattenallbut0(x)
        x = tf.nn.relu(tf.layers.dense(x, 512, name='lin', kernel_initializer=U.normc_initializer(1.0)))
    else:
        raise NotImplementedError

    # LSTM layer for memory
    lstm_cell = tf.contrib.rnn.BasicLSTMCell(lstm_hid_size, state_is_tuple=True, name="rnn")
    c_init = np.zeros((1, lstm_cell.state_size.c), np.float32)
    h_init = np.zeros((1, lstm_cell.state_size.h), np.float32)
    self.state_init = (c_init, h_init)
    c_in = U.get_placeholder(name="state_c", dtype=tf.float32, shape=(None, lstm_cell.state_size.c))
    h_in = U.get_placeholder(name="state_h", dtype=tf.float32, shape=(None, lstm_cell.state_size.h))
    self.state_in = (c_in, h_in)
    state_in = tf.contrib.rnn.LSTMStateTuple(c_in, h_in)
    lstm_outputs, lstm_states = lstm_cell(x, state_in)
    lstm_c, lstm_h = lstm_states
    self.state_out = (lstm_c, lstm_h)
    rnn_out = tf.reshape(lstm_outputs, (-1, lstm_hid_size))

    # concatenate sensor and physics features
    ob_last = tf.concat((rnn_out, obpz), axis=-1)

    # value network
    with tf.variable_scope("vf"):
        last_out = ob_last
        for i in range(num_hid_layers):
            last_out = tf.nn.relu(tf.layers.dense(last_out, hid_size, name="fc%i" % (i + 1), kernel_initializer=U.normc_initializer(1.0)))
        self.vpred = tf.layers.dense(last_out, 1, name='final', kernel_initializer=U.normc_initializer(1.0))[:, 0]

    with tf.variable_scope("pol"):
        last_out = ob_last
        for i in range(num_hid_layers):
            last_out = tf.nn.tanh(tf.layers.dense(last_out, hid_size, name='fc%i' % (i + 1), kernel_initializer=U.normc_initializer(1.0)))
        logits = tf.layers.dense(last_out, pdtype.param_shape()[0], name='logits', kernel_initializer=U.normc_initializer(0.01))
        self.pd = pdtype.pdfromflat(logits)

    stochastic = tf.placeholder(dtype=tf.bool, shape=())
    ac = U.switch(stochastic, self.pd.sample(), self.pd.mode())
    self._act = U.function([stochastic, ob_p, ob_f, c_in, h_in], [ac, self.vpred, lstm_c, lstm_h])
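# Hedged sketch of threading the LSTM state through a rollout with the policy
# above; `pi`, `env`, and `horizon` are hypothetical. The (c, h) pair returned
# by _act at step t is fed back in at step t + 1, starting from pi.state_init:
#
#   c, h = pi.state_init
#   ob_p, ob_f = env.reset()
#   for t in range(horizon):
#       ac, vpred, c, h = pi._act(True, ob_p[None], ob_f[None], c, h)
#       (ob_p, ob_f), rew, done, info = env.step(ac[0])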
def _init(self, ob_space, ac_space, hid_size, num_hid_layers, gaussian_fixed_var=True, num_options=2, dc=0):
    assert isinstance(ob_space, gym.spaces.Box)
    self.ac_space_dim = ac_space.shape[0]
    self.ob_space_dim = ob_space.shape[0]
    self.dc = dc
    self.last_action = tf.zeros(ac_space.shape, dtype=tf.float32)
    self.last_action_init = tf.zeros(ac_space.shape, dtype=tf.float32)
    self.num_options = num_options
    self.pdtype = pdtype = make_pdtype(ac_space)
    sequence_length = None

    ob = U.get_placeholder(name="ob", dtype=tf.float32, shape=[sequence_length] + list(ob_space.shape))
    option = U.get_placeholder(name="option", dtype=tf.int32, shape=[None])

    # create a filter for the pure shape, i.e. excluding u[k-1]
    obs_shape_pure = ((self.ob_space_dim - self.ac_space_dim),)
    with tf.variable_scope("obfilter"):
        self.ob_rms = RunningMeanStd(shape=ob_space.shape)
    with tf.variable_scope("obfilter_pure"):
        self.ob_rms_only = RunningMeanStd(shape=obs_shape_pure)

    obz = tf.clip_by_value((ob - self.ob_rms.mean) / self.ob_rms.std, -5.0, 5.0)
    obz_pure = tf.clip_by_value((ob[:, :-self.ac_space_dim] - self.ob_rms_only.mean) / self.ob_rms_only.std, -5.0, 5.0)

    last_out0 = obz       # for option 0
    last_out1 = obz_pure  # for option 1
    for i in range(num_hid_layers):
        last_out0 = tf.nn.tanh(U.dense(last_out0, hid_size, "vffc0%i" % (i + 1), weight_init=U.normc_initializer(1.0)))
        last_out1 = tf.nn.tanh(U.dense(last_out1, hid_size, "vffc1%i" % (i + 1), weight_init=U.normc_initializer(1.0)))
    last_out0 = U.dense(last_out0, 1, "vfff0", weight_init=U.normc_initializer(1.0))
    last_out1 = U.dense(last_out1, 1, "vfff1", weight_init=U.normc_initializer(1.0))
    # self.vpred = dense3D2(last_out, 1, "vffinal", option, num_options=num_options, weight_init=U.normc_initializer(1.0))[:, 0]
    # last_out0 = tf.Print(last_out0, [tf.size(last_out0[:, 0])])
    self.vpred = U.switch(option[0], last_out1, last_out0)[:, 0]

    # self.op_pi = tf.nn.softmax(U.dense(tf.stop_gradient(last_out), num_options, "OPfc%i" % (i + 1), weight_init=U.normc_initializer(1.0)))
    last_out0 = obz       # for option 0
    last_out1 = obz_pure  # for option 1
    for i in range(num_hid_layers):
        last_out0 = tf.nn.tanh(U.dense(last_out0, hid_size, "oppi0%i" % (i + 1), weight_init=U.normc_initializer(1.0)))
        last_out1 = tf.nn.tanh(U.dense(last_out1, hid_size, "oppi1%i" % (i + 1), weight_init=U.normc_initializer(1.0)))
    last_out0 = U.dense(last_out0, 1, "oppif0", weight_init=U.normc_initializer(1.0))
    last_out1 = U.dense(last_out1, 1, "oppif1", weight_init=U.normc_initializer(1.0))
    last_out = tf.concat([last_out0, last_out1], 1)
    self.op_pi = tf.nn.softmax(last_out)

    self.tpred = tf.nn.sigmoid(dense3D2(tf.stop_gradient(last_out), 1, "termhead", option, num_options=num_options, weight_init=U.normc_initializer(1.0)))[:, 0]
    # termination_sample = tf.greater(self.tpred, tf.random_uniform(shape=tf.shape(self.tpred), maxval=1.))
    termination_sample = tf.constant([True])

    # define the angle
    # ctrl_in = tf.reshape([(tf.math.atan2(ob[:, 1], ob[:, 0])), (ob[:, 2])], [-1, 2])
    # last_out = ctrl_in
    last_out = obz_pure
    for i in range(num_hid_layers):
        last_out = tf.nn.tanh(U.dense(last_out, hid_size, "polfc%i" % (i + 1), weight_init=U.normc_initializer(1.0)))
    if gaussian_fixed_var and isinstance(ac_space, gym.spaces.Box):
        mean = dense3D2(last_out, pdtype.param_shape()[0] // 2, "polfinal", option, num_options=num_options, weight_init=U.normc_initializer(0.01), bias=False)
        mean = tf.nn.tanh(mean)
        logstd = tf.get_variable(name="logstd", shape=[num_options, 1, pdtype.param_shape()[0] // 2], initializer=tf.zeros_initializer())
        pdparam = U.concatenate([mean, mean * 0.0 + logstd[option[0]]], axis=1)
    else:
        pdparam = U.dense(last_out, pdtype.param_shape()[0], "polfinal", U.normc_initializer(0.01))
    # self.op_pi = tf.nn.softmax(U.dense(tf.stop_gradient(last_out), num_options, "OPfc%i" % (i + 1), weight_init=U.normc_initializer(1.0)))

    self.pd = pdtype.pdfromflat(pdparam)
    self.state_in = []
    self.state_out = []

    stochastic = tf.placeholder(dtype=tf.bool, shape=())
    ac = U.switch(stochastic, self.pd.sample(), self.pd.mode())
    # ac = tf.Print(ac, [ac, option, ob], "action and option before selecting: ")
    ac = U.switch(option[0], ac, tf.stop_gradient(ob[:, -self.ac_space_dim:]))
    ac = tf.clip_by_value(ac, -1.0, 1.0)
    # ac = U.switch(option[0], tf.constant(1.0), tf.constant(0.0))
    # ac = tf.Print(ac, [ac], "action after selection: ")
    self.last_action = tf.stop_gradient(ac)

    self._act = U.function([stochastic, ob, option], [ac, self.vpred, last_out, logstd])
    self._get_v = U.function([ob, option], [self.vpred])
    self.get_term = U.function([ob, option], [termination_sample])
    self.get_tpred = U.function([ob, option], [self.tpred])
    self.get_vpred = U.function([ob, option], [self.vpred])
    self._get_op = U.function([ob], [self.op_pi])
def _build(self):
    ac_space = self._ac_space
    num_hid_layers = self._num_hid_layers
    hid_size = self._hid_size
    gaussian_fixed_var = self._gaussian_fixed_var
    if not isinstance(hid_size, list):
        hid_size = [hid_size]
    if len(hid_size) != num_hid_layers:
        hid_size += [hid_size[-1]] * (num_hid_layers - len(hid_size))

    self.obs = []
    self.pds = []
    for j in range(self._config.num_contexts):
        # obs
        _ob = {}
        for ob_name, ob_shape in self._ob_shape.items():
            _ob[ob_name] = U.get_placeholder(
                name="ob_{}/from_{}".format(ob_name, j), dtype=tf.float32,
                shape=[None] + self._ob_shape[ob_name])

        # obs normalization
        if self._config.obs_norm == 'learn':
            obz = [(_ob[ob_name] - self.ob_rms[ob_name].mean) / self.ob_rms[ob_name].std
                   for ob_name in self.ob_type]
        else:
            obz = [_ob[ob_name] for ob_name in self.ob_type]
        obz = [tf.clip_by_value(ob, -5.0, 5.0) for ob in obz]
        obz = tf.concat(obz, -1)

        # value function
        with tf.variable_scope('vf', reuse=tf.AUTO_REUSE):
            last_out = obz
            for i in range(num_hid_layers):
                last_out = self._activation(
                    tf.layers.dense(last_out, hid_size[i], name="fc%i" % (i + 1),
                                    kernel_initializer=U.normc_initializer(1.0)))
            vpred = tf.layers.dense(last_out, 1, name="final",
                                    kernel_initializer=U.normc_initializer(1.0))[:, 0]
            if j == self._id:
                self.vpred = vpred

        # policy
        pdtype = make_pdtype(ac_space)
        if j == self._id:
            self.pdtype = pdtype
        with tf.variable_scope('pol', reuse=tf.AUTO_REUSE):
            last_out = obz
            for i in range(num_hid_layers):
                last_out = self._activation(
                    tf.layers.dense(last_out, hid_size[i], name="fc%i" % (i + 1),
                                    kernel_initializer=U.normc_initializer(1.0)))
            if gaussian_fixed_var and isinstance(ac_space, gym.spaces.Box):
                mean = tf.layers.dense(last_out, pdtype.param_shape()[0] // 2, name="final",
                                       kernel_initializer=U.normc_initializer(0.01))
                logstd = tf.get_variable(name="logstd", shape=[1, pdtype.param_shape()[0] // 2],
                                         initializer=tf.zeros_initializer())
                pdparam = tf.concat([mean, mean * 0.0 + logstd], axis=1)
            else:
                pdparam = tf.layers.dense(last_out, pdtype.param_shape()[0], name="final",
                                          kernel_initializer=U.normc_initializer(0.01))

        self.obs.append([_ob[ob_name] for ob_name in self.ob_type])
        self.pds.append(pdtype.pdfromflat(pdparam))

    self.ob = self.obs[self._id]
    self.pd = self.pds[self._id]

    # sample action
    stochastic = tf.placeholder(dtype=tf.bool, shape=())
    ac = U.switch(stochastic, self.pd.sample(), self.pd.mode())
    self._act = U.function([stochastic] + self.ob, [ac, self.vpred])
    self._value = U.function([stochastic] + self.ob, self.vpred)
def __init__(self, sess, ob_space, ac_space, nbatch, nsteps, reuse=False,
             name='policy', args=None):  # pylint: disable=W0613
    policy_variance_state_dependent = args.policy_variance_state_dependent
    ac_fn = args.ac_fn
    hidden_sizes = args.hidden_sizes
    num_sharing_layers = args.num_sharing_layers
    num_layers = args.num_layers
    assert ac_fn in ['tanh', 'sigmoid', 'relu']

    if isinstance(hidden_sizes, int):
        assert num_layers is not None
        hidden_sizes = [hidden_sizes] * num_layers
    if num_layers is None:
        num_layers = len(hidden_sizes)
    assert num_layers == len(hidden_sizes)
    # print(f'Policy hidden_sizes:{hidden_sizes}')

    self.pdtype = make_pdtype(ac_space)
    with tf.variable_scope(name, reuse=reuse):
        X, processed_x = observation_input(ob_space, nbatch)
        activ = getattr(tf.nn, ac_fn)
        processed_x = tf.layers.flatten(processed_x)

        # --- shared layers
        for ind_layer in range(num_sharing_layers):
            processed_x = activ(fc(processed_x, f'share_fc{ind_layer}',
                                   nh=hidden_sizes[ind_layer], init_scale=np.sqrt(2)))

        # --- policy
        pi_h = processed_x
        for ind_layer in range(num_sharing_layers, num_layers):
            pi_h = activ(fc(pi_h, f'pi_fc{ind_layer}',
                            nh=hidden_sizes[ind_layer], init_scale=np.sqrt(2)))

        from gym import spaces
        params_additional = {}
        if policy_variance_state_dependent and isinstance(ac_space, spaces.Box):
            # state-dependent log-std head branching off the shared layers
            latent_logstd = processed_x
            for ind_layer in range(num_sharing_layers, num_layers):
                latent_logstd = activ(fc(latent_logstd, f'logstd_fc{ind_layer}',
                                         nh=hidden_sizes[ind_layer], init_scale=np.sqrt(2)))
            params_additional['latent_logstd'] = latent_logstd

        self.pd, self.pi = self.pdtype.pdfromlatent(
            pi_h, init_scale=0.01, logstd_initial=args.logstd, **params_additional)

        # --- value function
        vf_h = processed_x
        for ind_layer in range(num_sharing_layers, num_layers):
            vf_h = activ(fc(vf_h, f'vf_fc{ind_layer}',
                            nh=hidden_sizes[ind_layer], init_scale=np.sqrt(2)))
        vf = fc(vf_h, 'vf', 1)[:, 0]

        a_sample = self.pd.sample()
        neglogp_sample = self.pd.neglogp(a_sample)
        self.initial_state = None

        # --- predict function: the action fed to the prediction head can come
        # from a placeholder ('pl'), a stochastic sample, or the deterministic
        # mode ('det'), selected at run time via A_type
        if args.coef_predict_task > 0:
            import tensorflow.contrib.distributions as dists
            assert isinstance(ac_space, spaces.Box), 'Only implemented for Box action space'
            A_type = tf.placeholder_with_default('pl', shape=[])
            A_pl = self.pdtype.sample_placeholder([None])
            self.A = A_pl
            self.A_type = A_type
            A_input = U.switch(tf.equal(A_type, 'det'), self.pd.mode(), a_sample)
            A_input = U.switch(tf.equal(A_type, 'pl'), A_pl, A_input)
            predict_h = tf.concat((processed_x, A_input), axis=1)
            for ind_layer in range(num_sharing_layers, num_layers):
                predict_h = activ(fc(predict_h, f'predict_fc{ind_layer}',
                                     nh=hidden_sizes[ind_layer], init_scale=np.sqrt(2)))
            predict_mean = fc(predict_h, 'predict_mean', nh=ob_space.shape[0],
                              init_scale=np.sqrt(2))
            predict_cov_init_value = np.identity(ob_space.shape[0])
            predict_cov = tf.get_variable(
                name='predict_cov', shape=predict_cov_init_value.shape,
                initializer=tf.constant_initializer(predict_cov_init_value))
            predict_dist = dists.MultivariateNormalTriL(predict_mean, predict_cov)
            self.predict_dist = predict_dist

        scope_model = tf.get_variable_scope().name
        self.variables_all = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope_model)
        self.variables_trainable = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope_model)

        # --- set logstd (kept from the original source, disabled there as well)
        # if isinstance(ac_space, spaces.Box):
        #     if not policy_variance_state_dependent:
        #         logstd_pl, _ = observation_input(ac_space, batch_size=1, name='ac')
        #         assign_logstd = tf.assign(self.pdtype.logstd, logstd_pl)
        #         set_logstd_entity = U.function([logstd_pl], assign_logstd)
        #         def set_logstd(logstd_new):
        #             # if isinstance(logstd_new, float):
        #             #     logstd_new = [[logstd_new] * ac_space.shape[0]]
        #             set_logstd_entity(logstd_new)
        #         self.set_logstd = set_logstd
        #         self.get_logstd = U.function([], self.pdtype.logstd)

    def step(ob, *_args, **_kwargs):
        a, v, neglogp = sess.run([a_sample, vf, neglogp_sample], {X: ob})
        return a, v, self.initial_state, neglogp

    def value(ob, *_args, **_kwargs):
        return sess.run(vf, {X: ob})

    def step_policyflat(ob, *_args, **_kwargs):
        # TODO: test flatparam for discrete action spaces
        a, v, neglogp, policyflat = sess.run(
            [a_sample, vf, neglogp_sample, self.pd.flatparam()], {X: ob})
        return a, v, self.initial_state, neglogp, policyflat

    def step_test(ob, *_args, **_kwargs):
        a = sess.run([self.pd.mode()], {X: ob})
        return a

    self.X = X
    self.vf = vf
    self.step = step
    self.step_policyflat = step_policyflat
    self.value = value
    self.step_test = step_test
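# Illustrative only (not part of the original source): how the `step` / `value`
# closures above might be used to gather a rollout batch. `policy` and `env` are
# assumed to be a constructed policy instance and a Gym environment.
def collect_batch(policy, env, nsteps=128):
    obs, acs, vals, neglogps = [], [], [], []
    ob = env.reset()
    for _ in range(nsteps):
        a, v, _, neglogp = policy.step(ob[None])  # batch of one observation
        obs.append(ob)
        acs.append(a[0])
        vals.append(v[0])
        neglogps.append(neglogp[0])
        ob, _, done, _ = env.step(a[0])
        if done:
            ob = env.reset()
    return obs, acs, vals, neglogps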
def _init(self, ob_space, ac_space, hid_size, num_hid_layers,
          gaussian_fixed_var=True, num_options=2, dc=0, w_intfc=True):
    assert isinstance(ob_space, gym.spaces.Box)
    self.w_intfc = w_intfc
    self.state_in = []
    self.state_out = []
    self.dc = dc
    self.num_options = num_options
    self.pdtype = pdtype = make_pdtype(ac_space)
    sequence_length = None

    ob = U.get_placeholder(name="ob", dtype=tf.float32,
                           shape=[sequence_length] + list(ob_space.shape))
    option = U.get_placeholder(name="option", dtype=tf.int32, shape=[None])

    with tf.variable_scope("obfilter"):
        self.ob_rms = RunningMeanStd(shape=ob_space.shape)
    obz = tf.clip_by_value((ob - self.ob_rms.mean) / self.ob_rms.std, -5.0, 5.0)

    # value function: one head per option, indexed by the current option
    last_out = obz
    for i in range(num_hid_layers):
        last_out = tf.nn.tanh(U.dense(last_out, hid_size, "vffc%i" % (i + 1),
                                      weight_init=U.normc_initializer(1.0)))
    self.vpred = dense3D2(last_out, 1, "vffinal", option, num_options=num_options,
                          weight_init=U.normc_initializer(1.0))[:, 0]

    # termination head: probability that the current option terminates here
    last_out = obz
    for i in range(num_hid_layers):
        last_out = tf.nn.tanh(U.dense(last_out, hid_size, "termfc%i" % (i + 1),
                                      weight_init=U.normc_initializer(1.0)))
    self.tpred = tf.nn.sigmoid(
        dense3D2(tf.stop_gradient(last_out), 1, "termhead", option,
                 num_options=num_options, weight_init=U.normc_initializer(1.0)))[:, 0]
    termination_sample = tf.greater(
        self.tpred, tf.random_uniform(shape=tf.shape(self.tpred), maxval=1.))

    # intra-option policy
    last_out = obz
    for i in range(num_hid_layers):
        last_out = tf.nn.tanh(U.dense(last_out, hid_size, "polfc%i" % (i + 1),
                                      weight_init=U.normc_initializer(1.0)))
    if gaussian_fixed_var and isinstance(ac_space, gym.spaces.Box):
        mean = dense3D2(last_out, pdtype.param_shape()[0] // 2, "polfinal", option,
                        num_options=num_options, weight_init=U.normc_initializer(0.01))
        logstd = tf.get_variable(
            name="logstd", shape=[num_options, 1, pdtype.param_shape()[0] // 2],
            initializer=tf.zeros_initializer())
        pdparam = U.concatenate([mean, mean * 0.0 + logstd[option[0]]], axis=1)
    else:
        pdparam = U.dense(last_out, pdtype.param_shape()[0], "polfinal",
                          U.normc_initializer(0.01))
    self.pd = pdtype.pdfromflat(pdparam)

    stochastic = tf.placeholder(dtype=tf.bool, shape=())
    ac = U.switch(stochastic, self.pd.sample(), self.pd.mode())

    # interest function: per-option interest in the current state
    last_out = obz
    for i in range(num_hid_layers):
        last_out = tf.nn.tanh(U.dense(last_out, hid_size, "intfc%i" % (i + 1),
                                      weight_init=U.normc_initializer(1.0)))
    self.intfc = tf.sigmoid(U.dense(last_out, num_options, "intfcfinal",
                                    weight_init=U.normc_initializer(1.0)))

    # policy over options
    last_out = obz
    for i in range(num_hid_layers):
        last_out = tf.nn.tanh(U.dense(last_out, hid_size, "OP%i" % (i + 1),
                                      weight_init=U.normc_initializer(1.0)))
    self.op_pi = tf.nn.softmax(U.dense(last_out, num_options, "OPfinal",
                                       weight_init=U.normc_initializer(1.0)))

    self._act = U.function([stochastic, ob, option], [ac])
    self.get_term = U.function([ob, option], [termination_sample])
    self.get_tpred = U.function([ob, option], [self.tpred])
    self.get_vpred = U.function([ob, option], [self.vpred])
    self._get_op_int = U.function([ob], [self.op_pi, self.intfc])
    self._get_intfc = U.function([ob], [self.intfc])
    self._get_op = U.function([ob], [self.op_pi])
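# `dense3D2` is used above but not defined in this file. A minimal sketch of what
# an option-indexed dense layer of this kind could look like (an assumption for
# illustration, not the original implementation): one weight matrix per option,
# with the slice for the active option selected via the `option` placeholder.
import tensorflow as tf

def dense3D2_sketch(x, size, name, option, num_options=1, weight_init=None, bias=True):
    with tf.variable_scope(name):
        # one [input_dim, size] weight matrix per option
        w = tf.get_variable('w', [num_options, x.get_shape()[1], size],
                            initializer=weight_init)
        ret = tf.matmul(x, w[option[0]])  # pick the weight slice for the active option
        if bias:
            b = tf.get_variable('b', [num_options, size],
                                initializer=tf.zeros_initializer())
            return ret + b[option[0]]
        return ret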