def _init(self, ob_space, ac_space, hid_size, num_hid_layers, rnn_hid_units, gaussian_fixed_var=True):
    """Build the recurrent value/policy graph and the ``_act`` function.

    The "ob" placeholder is passed through ``self.rnn`` once per head
    (inside the "vf" and "pf" variable scopes).  Each head then applies
    ``num_hid_layers`` ``U.dense`` layers of width ``hid_size``; no
    activation is applied between them in this function.

    Args:
        ob_space: observation space; ``ob_space.shape`` sizes the placeholder.
        ac_space: action space handed to ``make_pdtype``.
        hid_size: width of every hidden dense layer.
        num_hid_layers: number of dense layers after the RNN in each head.
        rnn_hid_units: forwarded to ``self.rnn``.
        gaussian_fixed_var: must be truthy (asserted below).

    Raises:
        AssertionError: if ``gaussian_fixed_var`` is falsy or ``ac_space``
            is not a ``gym.spaces.Box``.
    """
    # assert isinstance(ob_space, gym.spaces.Box)
    self.pdtype = pdtype = make_pdtype(ac_space)
    sequence_length = None
    ob = U.get_placeholder(name="ob", dtype=tf.float32,
                           shape=[sequence_length] + list(ob_space.shape))

    # Value head: RNN over the observation, then the dense stack.
    with tf.variable_scope("vf"):
        value_hidden = self.rnn(ob, ob_space.shape[0], rnn_hid_units)
        for layer in range(num_hid_layers):
            value_hidden = U.dense(value_hidden, hid_size, "vf_dense%i" % layer,
                                   weight_init=U.normc_initializer(1.0))
        self.vpred = U.dense(value_hidden, 1, "vffinal",
                             weight_init=U.normc_initializer(1.0))[:, 0]

    # Policy head: its own RNN pass, then the dense stack.
    with tf.variable_scope("pf"):
        policy_hidden = self.rnn(ob, ob_space.shape[0], rnn_hid_units)
        for layer in range(num_hid_layers):
            policy_hidden = U.dense(policy_hidden, hid_size, "pf_dense%i" % layer,
                                    weight_init=U.normc_initializer(1.0))

        # Only the state-independent-logstd Gaussian head is implemented.
        assert gaussian_fixed_var and isinstance(ac_space, gym.spaces.Box)
        action_dim = pdtype.param_shape()[0] // 2
        mean = U.dense(policy_hidden, action_dim, "polfinal", U.normc_initializer(0.01))
        logstd = tf.get_variable(name="logstd", shape=[1, action_dim],
                                 initializer=tf.zeros_initializer())
        # ``mean * 0.0 + logstd`` broadcasts logstd to the shape of mean.
        pdparam = U.concatenate([mean, mean * 0.0 + logstd], axis=1)

    self.pd = pdtype.pdfromflat(pdparam)

    self.state_in = []
    self.state_out = []

    # ``stochastic`` selects between sampling and the distribution mode.
    stochastic = tf.placeholder(dtype=tf.bool, shape=())
    ac = U.switch(stochastic, self.pd.sample(), self.pd.mode())
    self._act = U.function([stochastic, ob], [ac, self.vpred])
def _create_network(self):
    """Build separate actor and critic trunks and hand them to the output heads.

    Each trunk is two 32-unit ``U.dense`` layers with tanh activations
    ("a_1"/"a_2" for the actor, "c_1"/"c_2" for the critic).  When
    ``self.ob_filter`` is truthy the observation is first normalised with a
    new ``RunningMeanStd`` and clipped to [-5, 5].
    """
    net_input = self.ob

    # create ob filter
    if self.ob_filter:
        self.ob_rms = RunningMeanStd(shape=self.ob_space.shape)
        normalized = (self.ob - self.ob_rms.mean) / self.ob_rms.std
        net_input = tf.clip_by_value(normalized, -5.0, 5.0)

    def _tanh_dense(inputs, name):
        # One 32-unit dense layer followed by tanh.
        return tf.nn.tanh(
            U.dense(inputs, 32, name, weight_init=U.normc_initializer(1.0)))

    # actor
    action_layer = _tanh_dense(_tanh_dense(net_input, "a_1"), "a_2")

    # critic
    value_layer = _tanh_dense(_tanh_dense(net_input, "c_1"), "c_2")

    self._create_logit_value(action_layer, value_layer, self.gaussian_fixed_var)
def _init(self, ob_space, ac_space):
    """Build a CNN policy and a CNN value function that share no weights.

    The same two-conv + dense trunk is instantiated twice, once under the
    "pol" variable scope and once under "vf".  The observation is divided
    by 255.0 before entering either trunk.

    Args:
        ob_space: observation space; must be a ``gym.spaces.Box``.
        ac_space: action space handed to ``make_pdtype``.

    Raises:
        AssertionError: if ``ob_space`` is not a ``gym.spaces.Box``.
    """
    assert isinstance(ob_space, gym.spaces.Box)

    self.pdtype = pdtype = make_pdtype(ac_space)
    sequence_length = None

    ob = U.get_placeholder(name="ob", dtype=tf.float32,
                           shape=[sequence_length] + list(ob_space.shape))
    obscaled = ob / 255.0

    def _trunk(inputs):
        # conv(8, 8x8, stride 4) -> conv(16, 4x4, stride 2) -> flatten -> dense(128)
        hidden = tf.nn.relu(U.conv2d(inputs, 8, "l1", [8, 8], [4, 4], pad="VALID"))
        hidden = tf.nn.relu(U.conv2d(hidden, 16, "l2", [4, 4], [2, 2], pad="VALID"))
        hidden = U.flattenallbut0(hidden)
        return tf.nn.relu(U.dense(hidden, 128, 'lin', U.normc_initializer(1.0)))

    with tf.variable_scope("pol"):
        policy_features = _trunk(obscaled)
        logits = U.dense(policy_features, pdtype.param_shape()[0], "logits",
                         U.normc_initializer(0.01))
        self.pd = pdtype.pdfromflat(logits)

    with tf.variable_scope("vf"):
        value_features = _trunk(obscaled)
        # Unlike the policies that slice ``[:, 0]``, the trailing axis is kept here.
        self.vpred = U.dense(value_features, 1, "value", U.normc_initializer(1.0))
        self.vpredz = self.vpred

    self.state_in = []
    self.state_out = []

    # ``stochastic`` is accepted as an input but the action is always sampled.
    stochastic = tf.placeholder(dtype=tf.bool, shape=())
    ac = self.pd.sample()  # XXX
    self._act = U.function([stochastic, ob], [ac, self.vpred])
def _init(self, ob_space, ac_space, hid_size, num_hid_layers, gaussian_fixed_var=True):
    """Build an MLP policy and value function over a normalised observation.

    Args:
        ob_space: observation space; must be a ``gym.spaces.Box``.
        ac_space: action space handed to ``make_pdtype``.
        hid_size: width of every hidden layer.
        num_hid_layers: number of tanh hidden layers in each network.
        gaussian_fixed_var: when truthy and ``ac_space`` is a Box, use a
            state-independent ``logstd`` variable; otherwise a single dense
            layer emits all distribution parameters.

    Raises:
        AssertionError: if ``ob_space`` is not a ``gym.spaces.Box``.
    """
    assert isinstance(ob_space, gym.spaces.Box)

    self.pdtype = pdtype = make_pdtype(ac_space)
    sequence_length = None

    ob = U.get_placeholder(name="ob", dtype=tf.float32,
                           shape=[sequence_length] + list(ob_space.shape))

    with tf.variable_scope("obfilter"):
        self.ob_rms = RunningMeanStd(shape=ob_space.shape)

    # Standardise with the running statistics, then clip to [-5, 5].
    obz = tf.clip_by_value((ob - self.ob_rms.mean) / self.ob_rms.std, -5.0, 5.0)

    def _tanh_stack(name_format):
        # ``num_hid_layers`` tanh layers named name_format % 1, % 2, ...
        hidden = obz
        for layer in range(1, num_hid_layers + 1):
            hidden = tf.nn.tanh(
                U.dense(hidden, hid_size, name_format % layer,
                        weight_init=U.normc_initializer(1.0)))
        return hidden

    # Value network.
    self.vpred = U.dense(_tanh_stack("vffc%i"), 1, "vffinal",
                         weight_init=U.normc_initializer(1.0))[:, 0]

    # Policy network.
    policy_hidden = _tanh_stack("polfc%i")
    if gaussian_fixed_var and isinstance(ac_space, gym.spaces.Box):
        action_dim = pdtype.param_shape()[0] // 2
        mean = U.dense(policy_hidden, action_dim, "polfinal", U.normc_initializer(0.01))
        logstd = tf.get_variable(name="logstd", shape=[1, action_dim],
                                 initializer=tf.zeros_initializer())
        # ``mean * 0.0 + logstd`` broadcasts logstd to the shape of mean.
        pdparam = U.concatenate([mean, mean * 0.0 + logstd], axis=1)
    else:
        pdparam = U.dense(policy_hidden, pdtype.param_shape()[0], "polfinal",
                          U.normc_initializer(0.01))

    self.pd = pdtype.pdfromflat(pdparam)

    self.state_in = []
    self.state_out = []

    # ``stochastic`` selects between sampling and the distribution mode.
    stochastic = tf.placeholder(dtype=tf.bool, shape=())
    ac = U.switch(stochastic, self.pd.sample(), self.pd.mode())
    self._act = U.function([stochastic, ob], [ac, self.vpred])
def _init(self, ob_space, ac_space, hid_size, num_hid_layers, gaussian_fixed_var=True):
    """Create the MLP actor-critic graph and the ``_act`` callable.

    Both networks read the same clipped, mean/std-normalised observation
    and use ``num_hid_layers`` tanh layers of ``hid_size`` units.

    Args:
        ob_space: observation space; must be a ``gym.spaces.Box``.
        ac_space: action space handed to ``make_pdtype``.
        hid_size: hidden layer width.
        num_hid_layers: number of hidden layers per network.
        gaussian_fixed_var: use a state-independent ``logstd`` variable when
            truthy and ``ac_space`` is a Box.

    Raises:
        AssertionError: if ``ob_space`` is not a ``gym.spaces.Box``.
    """
    assert isinstance(ob_space, gym.spaces.Box)

    self.pdtype = pdtype = make_pdtype(ac_space)
    sequence_length = None
    ob_shape = [sequence_length] + list(ob_space.shape)
    ob = U.get_placeholder(name="ob", dtype=tf.float32, shape=ob_shape)

    with tf.variable_scope("obfilter"):
        self.ob_rms = RunningMeanStd(shape=ob_space.shape)

    centered = ob - self.ob_rms.mean
    obz = tf.clip_by_value(centered / self.ob_rms.std, -5.0, 5.0)

    # --- value function ---
    vf_hidden = obz
    for idx in range(num_hid_layers):
        pre_activation = U.dense(vf_hidden, hid_size, "vffc%i" % (idx + 1),
                                 weight_init=U.normc_initializer(1.0))
        vf_hidden = tf.nn.tanh(pre_activation)
    vf_out = U.dense(vf_hidden, 1, "vffinal", weight_init=U.normc_initializer(1.0))
    self.vpred = vf_out[:, 0]

    # --- policy ---
    pol_hidden = obz
    for idx in range(num_hid_layers):
        pre_activation = U.dense(pol_hidden, hid_size, "polfc%i" % (idx + 1),
                                 weight_init=U.normc_initializer(1.0))
        pol_hidden = tf.nn.tanh(pre_activation)

    if gaussian_fixed_var and isinstance(ac_space, gym.spaces.Box):
        half = pdtype.param_shape()[0] // 2
        mean = U.dense(pol_hidden, half, "polfinal", U.normc_initializer(0.01))
        logstd = tf.get_variable(name="logstd", shape=[1, half],
                                 initializer=tf.zeros_initializer())
        # Adding to ``mean * 0.0`` broadcasts logstd to mean's shape.
        pdparam = U.concatenate([mean, mean * 0.0 + logstd], axis=1)
    else:
        pdparam = U.dense(pol_hidden, pdtype.param_shape()[0], "polfinal",
                          U.normc_initializer(0.01))

    self.pd = pdtype.pdfromflat(pdparam)

    self.state_in = []
    self.state_out = []

    # Feed True to sample an action, False to take the mode.
    stochastic = tf.placeholder(dtype=tf.bool, shape=())
    ac = U.switch(stochastic, self.pd.sample(), self.pd.mode())
    self._act = U.function([stochastic, ob], [ac, self.vpred])
def _create_logit_value(self, action_layer, value_layer, gaussian_fixed_var=False):
    """Attach the policy-distribution head and the value head.

    Sets ``self.pd``, ``self.ac`` and ``self.vpred``.

    Args:
        action_layer: final hidden tensor of the actor trunk.
        value_layer: final hidden tensor of the critic trunk.
        gaussian_fixed_var: when truthy and ``self.ac_space`` is a
            ``gym.spaces.Box``, use a state-independent ``logstd`` variable;
            otherwise one dense layer emits all distribution parameters.
    """
    param_size = self.pdtype.param_shape()[0]

    # actor
    if gaussian_fixed_var and isinstance(self.ac_space, gym.spaces.Box):
        action_dim = param_size // 2
        mean = U.dense(action_layer, action_dim, "polfinal",
                       U.normc_initializer(0.01))
        logstd = tf.get_variable(name="logstd", shape=[1, action_dim],
                                 initializer=tf.zeros_initializer())
        # ``mean * 0.0 + logstd`` broadcasts logstd to the shape of mean.
        pdparam = U.concatenate([mean, mean * 0.0 + logstd], axis=1)
    else:
        pdparam = U.dense(action_layer, param_size, "polfinal",
                          U.normc_initializer(0.01))

    self.pd = self.pdtype.pdfromflat(pdparam)
    # ``self.stochastic`` selects between sampling and the distribution mode.
    self.ac = U.switch(self.stochastic, self.pd.sample(), self.pd.mode())

    # critic
    value_out = U.dense(value_layer, 1, "vffinal",
                        weight_init=U.normc_initializer(1.0))
    self.vpred = value_out[:, 0]
def __init__(self, sess, ob_space, ac_space, nbatch, nsteps, reuse=False):  # pylint: disable=W0613
    """Build a CNN policy with separate conv towers for policy and value.

    Each tower is three 3x3, stride-1, SAME-padded convolutions
    (32, 64, 128 filters) followed by a 512-unit dense layer, all with ReLU.
    The policy tower uses variables "l1".."l3"/"lin"; the value tower uses
    "yl1".."yl3"/"ylin".

    Args:
        sess: session used by ``step`` and ``value`` to run the graph.
        ob_space: observation space whose ``shape`` unpacks to (nh, nw, nc).
        ac_space: action space handed to ``make_pdtype``.
        nbatch: fixed batch dimension of the observation placeholder.
        nsteps: unused here.
        reuse: forwarded to ``tf.variable_scope("model", ...)``.

    Exposes ``X``, ``pi``, ``vf``, ``pd``, ``pdtype``, ``initial_state``
    and the callables ``step(ob)`` and ``value(ob)``.
    """
    nh, nw, nc = ob_space.shape
    ob_shape = (nbatch, nh, nw, nc)
    X = tf.placeholder(tf.float32, ob_shape)  # obs
    self.pdtype = pdtype = make_pdtype(ac_space)

    def _tower(inputs, names):
        # Three ReLU convs then a ReLU dense layer; ``names`` are the
        # variable names of the four layers, in order.
        conv1, conv2, conv3, dense_name = names
        hidden = tf.nn.relu(U.conv2d(inputs, 32, conv1, [3, 3], [1, 1], pad="SAME"))
        hidden = tf.nn.relu(U.conv2d(hidden, 64, conv2, [3, 3], [1, 1], pad="SAME"))
        hidden = tf.nn.relu(U.conv2d(hidden, 128, conv3, [3, 3], [1, 1], pad="SAME"))
        hidden = U.flattenallbut0(hidden)
        return tf.nn.relu(U.dense(hidden, 512, dense_name, U.normc_initializer(1.0)))

    with tf.variable_scope("model", reuse=reuse):
        x = _tower(X, ("l1", "l2", "l3", "lin"))
        y = _tower(X, ("yl1", "yl2", "yl3", "ylin"))
        pi = U.dense(x, pdtype.param_shape()[0], "logits", U.normc_initializer(0.01))
        vf = U.dense(y, 1, "value", U.normc_initializer(1.0))[:, 0]

    self.pd = self.pdtype.pdfromflat(pi)

    a0 = self.pd.sample()
    neglogp0 = self.pd.neglogp(a0)
    self.initial_state = None

    def step(ob, *_args, **_kwargs):
        # Returns (action, value, state, neglogp); state is always initial_state.
        a, v, neglogp = sess.run([a0, vf, neglogp0], {X: ob})
        return a, v, self.initial_state, neglogp

    def value(ob, *_args, **_kwargs):
        return sess.run(vf, {X: ob})

    self.X = X
    self.pi = pi
    self.vf = vf
    self.step = step
    self.value = value
def _init(self, ob_space, ac_space, kind):
    """Build a CNN policy whose policy and value heads share one trunk.

    Args:
        ob_space: observation space; must be a ``gym.spaces.Box``.
        ac_space: action space handed to ``make_pdtype``.
        kind: ``'small'`` (two conv layers, 256-unit dense) or ``'large'``
            (three conv layers, 512-unit dense).

    Raises:
        AssertionError: if ``ob_space`` is not a ``gym.spaces.Box``.
        NotImplementedError: for any other ``kind``.
    """
    assert isinstance(ob_space, gym.spaces.Box)

    self.pdtype = pdtype = make_pdtype(ac_space)
    sequence_length = None

    ob = U.get_placeholder(name="ob", dtype=tf.float32,
                           shape=[sequence_length] + list(ob_space.shape))

    x = ob / 255.0

    # Each conv spec is (num_filters, name, filter_size, stride).
    if kind == 'small':  # from A3C paper
        conv_specs = [
            (16, "l1", [8, 8], [4, 4]),
            (32, "l2", [4, 4], [2, 2]),
        ]
        dense_units = 256
    elif kind == 'large':  # Nature DQN
        conv_specs = [
            (32, "l1", [8, 8], [4, 4]),
            (64, "l2", [4, 4], [2, 2]),
            (64, "l3", [3, 3], [1, 1]),
        ]
        dense_units = 512
    else:
        raise NotImplementedError

    for num_filters, layer_name, filter_size, stride in conv_specs:
        x = tf.nn.relu(U.conv2d(x, num_filters, layer_name, filter_size, stride, pad="VALID"))
    x = U.flattenallbut0(x)
    x = tf.nn.relu(U.dense(x, dense_units, 'lin', U.normc_initializer(1.0)))

    logits = U.dense(x, pdtype.param_shape()[0], "logits", U.normc_initializer(0.01))
    self.pd = pdtype.pdfromflat(logits)
    self.vpred = U.dense(x, 1, "value", U.normc_initializer(1.0))[:, 0]

    self.state_in = []
    self.state_out = []

    # ``stochastic`` is accepted as an input but the action is always sampled.
    stochastic = tf.placeholder(dtype=tf.bool, shape=())
    ac = self.pd.sample()  # XXX
    self._act = U.function([stochastic, ob], [ac, self.vpred])
def _init(self, ob_space, ac_space, kind):
    """Create the shared-trunk CNN actor-critic graph.

    The observation is divided by 255.0, run through the conv stack chosen
    by ``kind``, flattened and passed through one ReLU dense layer; the
    "logits" and "value" heads both read that feature vector.

    Args:
        ob_space: observation space; must be a ``gym.spaces.Box``.
        ac_space: action space handed to ``make_pdtype``.
        kind: ``'small'`` or ``'large'``.

    Raises:
        AssertionError: if ``ob_space`` is not a ``gym.spaces.Box``.
        NotImplementedError: for any other ``kind``.
    """
    assert isinstance(ob_space, gym.spaces.Box)

    self.pdtype = pdtype = make_pdtype(ac_space)
    sequence_length = None
    ob = U.get_placeholder(name="ob", dtype=tf.float32,
                           shape=[sequence_length] + list(ob_space.shape))

    def _relu_conv(inputs, num_filters, name, filter_size, stride):
        # VALID-padded convolution followed by ReLU.
        return tf.nn.relu(U.conv2d(inputs, num_filters, name, filter_size, stride, pad="VALID"))

    def _relu_dense(inputs, units):
        # Flatten everything but the batch axis, then a ReLU dense layer.
        flat = U.flattenallbut0(inputs)
        return tf.nn.relu(U.dense(flat, units, 'lin', U.normc_initializer(1.0)))

    features = ob / 255.0
    if kind == 'small':  # from A3C paper
        features = _relu_conv(features, 16, "l1", [8, 8], [4, 4])
        features = _relu_conv(features, 32, "l2", [4, 4], [2, 2])
        features = _relu_dense(features, 256)
    elif kind == 'large':  # Nature DQN
        features = _relu_conv(features, 32, "l1", [8, 8], [4, 4])
        features = _relu_conv(features, 64, "l2", [4, 4], [2, 2])
        features = _relu_conv(features, 64, "l3", [3, 3], [1, 1])
        features = _relu_dense(features, 512)
    else:
        raise NotImplementedError

    logits = U.dense(features, pdtype.param_shape()[0], "logits", U.normc_initializer(0.01))
    self.pd = pdtype.pdfromflat(logits)
    self.vpred = U.dense(features, 1, "value", U.normc_initializer(1.0))[:, 0]

    self.state_in = []
    self.state_out = []

    # ``stochastic`` is accepted as an input but the action is always sampled.
    stochastic = tf.compat.v1.placeholder(dtype=tf.bool, shape=())
    ac = self.pd.sample()  # XXX
    self._act = U.function([stochastic, ob], [ac, self.vpred])
def resnet(inputs, hid_size, name):
    """Apply a two-layer dense residual block to ``inputs``.

    Computes ``relu(dense2(relu(dense1(inputs))) + inputs)``.  The dense
    layers are named ``"<name>_dense1"`` and ``"<name>_dense2"``.

    Args:
        inputs: input tensor; it is added to the second dense output, so
            its last dimension presumably has to equal ``hid_size``.
        hid_size: width of both dense layers.
        name: prefix for the two layer names.

    Returns:
        The output tensor of the residual block.
    """
    # Batch norm after each dense layer is disabled in this block.
    first = U.dense(inputs, hid_size, "%s_dense1" % name,
                    weight_init=U.normc_initializer(1.0))
    hidden = tf.nn.relu(first)
    second = U.dense(hidden, hid_size, "%s_dense2" % name,
                     weight_init=U.normc_initializer(1.0))
    # Skip connection is added before the final ReLU.
    return tf.nn.relu(second + inputs)
def build_forward(self, state, reuse):
    """Build the sampling path of a K-component Gaussian-mixture policy.

    A tanh MLP maps ``state`` to ``self.K`` mean heads; each component also
    has its own ``logstd_k`` variable.  One component per sample is chosen
    with ``tf.multinomial`` over uniform (all-zero) logits, and the action
    is ``exp(logstd) * noise + mean`` for the chosen component.  The result
    is stored in ``self.y_sample``; the per-component tensors are stored in
    ``self.meandict`` and ``self.logstddict``.

    Args:
        state: input tensor; its static first dimension sizes the noise.
        reuse: forwarded to ``tf.variable_scope('forward', ...)``.

    NOTE(review): the number of mask samples is read from ``self.state``,
    not from the ``state`` argument -- confirm the two always agree.
    NOTE(review): the final reductions use ``keepdims=True``, so the reduced
    axis is retained, whereas the inline comment in the original describes
    the result as [batchsize, action dim] -- confirm the intended shape.
    """
    # build noise samples
    noise_shape = [state.get_shape().as_list()[0], self.input_dim]
    standard_normal = tfd.Normal(loc=0., scale=1.)
    noise_samples = standard_normal.sample(noise_shape)  # size of [batchsize, action dim]

    # build forward
    hidden = state
    self.meandict = component_means = []
    self.logstddict = component_logstds = []
    with tf.variable_scope('forward', reuse=reuse):
        for layer in range(self.num_hid_layers):
            hidden = tf.nn.tanh(
                U.dense(hidden, self.hid_size, "polfc%i" % (layer + 1),
                        weight_init=U.normc_initializer(1.0)))
        for k in range(self.K):
            mean_k = U.dense(hidden, self.input_dim, "polfinal_{}".format(k),
                             U.normc_initializer(0.01))
            logstd_k = tf.get_variable(name="logstd_{}".format(k),
                                       shape=[1, self.input_dim],
                                       initializer=tf.zeros_initializer())
            component_means.append(mean_k)
            component_logstds.append(logstd_k)

    all_means = tf.concat(component_means, axis=1)  # size of [batchsize, action dim * K]
    all_logstds = tf.concat(component_logstds, axis=1)

    # generate masks: one uniformly chosen component per sample
    logits = [0.0] * self.K
    num_samples = self.state.shape.as_list()[0]
    component_ids = tf.multinomial([logits], num_samples)
    onehot_mask = tf.squeeze(tf.one_hot(component_ids, self.K), 0)
    # Repeat each one-hot entry ``input_dim`` times so it lines up with the
    # concatenated per-component parameters.
    repeated_mask = tf.tile(tf.expand_dims(onehot_mask, axis=2),
                            [1, 1, self.input_dim])
    flat_mask = tf.squeeze(
        tf.reshape(repeated_mask, [-1, self.input_dim * self.K, 1]), axis=2)

    # select: zero out the parameters of the components that were not chosen
    masked_means = tf.multiply(flat_mask, all_means)  # size of [batchsize, action dim * K]
    masked_logstds = tf.multiply(flat_mask, all_logstds)

    # sample action mean and logstd: sum over the component axis
    mean = tf.reshape(masked_means, [-1, self.K, self.input_dim])  # size of [batchsize, K, action dim]
    logstd = tf.reshape(masked_logstds, [-1, self.K, self.input_dim])
    mean_final = tf.reduce_sum(mean, axis=1, keepdims=True)
    logstd_final = tf.reduce_sum(logstd, axis=1, keepdims=True)

    # sample action
    action = tf.exp(logstd_final) * noise_samples + mean_final
    self.y_sample = action
def _init(self, ob_space, ac_space, hid_size, num_hid_layers, num_units=3, num_layers=4):
    """Build a normalizing-flow policy and an MLP value function.

    Separate fixed-batch-size placeholders are created for policy training
    (1024), value training (64), Fisher-vector-product sub-sampling (205)
    and acting (1).  The value network and the
    ``NormalizingFlowStateModel`` policy are each instantiated once with
    ``reuse=False`` and again with ``reuse=True`` for the other inputs.

    Args:
        ob_space: observation space; must be a ``gym.spaces.Box``.
        ac_space: action space; its ``shape`` sizes the action placeholders.
        hid_size: width of the value network's hidden layers.
        num_hid_layers: number of tanh hidden layers in the value network.
        num_units: forwarded to ``NormalizingFlowStateModel``.
        num_layers: forwarded to ``NormalizingFlowStateModel``.

    Raises:
        AssertionError: if ``ob_space`` is not a ``gym.spaces.Box``.
    """
    assert isinstance(ob_space, gym.spaces.Box)

    nbatch_train = 1024
    nbatch_vf_train = 64
    nbatch_fvp_train = 205  # sub-sampled size

    ob_dims = list(ob_space.shape)
    ac_dims = list(ac_space.shape)

    self.ob_train = ob_train = U.get_placeholder(
        name="ob_train", dtype=tf.float32, shape=[nbatch_train] + ob_dims)
    self.action_train = action_train = U.get_placeholder(
        name='ac_train', dtype=tf.float32, shape=[nbatch_train] + ac_dims)
    ob_act = U.get_placeholder(name="ob_act", dtype=tf.float32, shape=[1] + ob_dims)
    action_act = U.get_placeholder(name='ac_act', dtype=tf.float32, shape=[1] + ac_dims)
    self.ob_vf_train = ob_vf_train = U.get_placeholder(
        name="ob_vf_train", dtype=tf.float32, shape=[nbatch_vf_train] + ob_dims)
    self.ob_fvp_train = ob_fvp_train = U.get_placeholder(
        name="ob_fvp_train", dtype=tf.float32, shape=[nbatch_fvp_train] + ob_dims)
    self.ac_fvp_train = action_fvp_train = U.get_placeholder(
        name="ac_fvp_act", dtype=tf.float32, shape=[nbatch_fvp_train] + ac_dims)

    with tf.variable_scope("obfilter"):
        self.ob_rms = RunningMeanStd(shape=ob_space.shape)

    def _normalize(observation):
        # Standardise with the running statistics, then clip to [-5, 5].
        return tf.clip_by_value(
            (observation - self.ob_rms.mean) / self.ob_rms.std, -5.0, 5.0)

    obz_train = _normalize(ob_train)
    obz_act = _normalize(ob_act)
    obz_vf_train = _normalize(ob_vf_train)
    obz_fvp_train = _normalize(ob_fvp_train)

    def _value_net(inputs, reuse):
        # tanh MLP under the shared 'value' scope; returns the value prediction.
        hidden = inputs
        with tf.variable_scope('value', reuse=reuse):
            for layer in range(num_hid_layers):
                hidden = tf.nn.tanh(
                    U.dense(hidden, hid_size, "vffc%i" % (layer + 1),
                            weight_init=U.normc_initializer(1.0)))
            return U.dense(hidden, 1, "vffinal",
                           weight_init=U.normc_initializer(1.0))[:, 0]

    # value function
    self.vpred_train = _value_net(obz_vf_train, False)
    self.vpred_act = _value_net(obz_act, True)

    # policy
    policy_train = NormalizingFlowStateModel(
        obz_train, action_train, name='policy', reuse=False,
        num_units=num_units, num_layers=num_layers)
    policy_act = NormalizingFlowStateModel(
        obz_act, action_act, name='policy', reuse=True,
        num_units=num_units, num_layers=num_layers)
    policy_fvp_train = NormalizingFlowStateModel(
        obz_fvp_train, action_fvp_train, name='policy', reuse=True,
        num_units=num_units, num_layers=num_layers)

    self.pi_act = policy_act.y_sample  # act for forward sampling
    self.pi_train = policy_fvp_train.y_sample  # for fvp
    self.entropy_train = policy_train.entropy
    self.log_prob_act = policy_act.log_prob
    self.action_act = action_act
    self.log_prob_train = policy_train.log_prob  # logprob
    self.log_prob_fvp_train = policy_fvp_train.log_prob

    self.state_in = []
    self.state_out = []

    # Acting takes only the observation; there is no stochastic/mode switch.
    self._act = U.function([ob_act], [self.pi_act, self.vpred_act])
    self.ob_act = ob_act
def _init(self, ob_space, ac_space, hid_size, num_hid_layers, activation='tanh', gaussian_fixed_var=True,
          keep=1.0):
    """Build an MLP policy/value graph with selectable activation and dropout.

    When the module-level ``PREPROCESS`` flag is truthy the placeholder has
    width ``OBSERVATION_DIM`` and no observation filter is created;
    otherwise the width is ``ob_space.shape[0]`` and the value network
    reads the clipped, normalised observation.

    NOTE(review): the policy stack always starts from the raw ``ob`` (not
    ``obz``), even when the observation filter is active -- confirm this is
    intended.

    Args:
        ob_space: observation space; must be a ``gym.spaces.Box``.
        ac_space: action space handed to ``make_pdtype``.
        hid_size: hidden layer width.
        num_hid_layers: number of hidden layers per network.
        activation: ``'tanh'``, ``'elu'`` or ``'lrelu'``.
        gaussian_fixed_var: use a state-independent ``logstd`` variable when
            truthy and ``ac_space`` is a Box.
        keep: ``keep_prob`` for the dropout applied after each hidden layer.

    Raises:
        AssertionError: if ``ob_space`` is not a ``gym.spaces.Box``.
        NotImplementedError: for an unknown ``activation`` name.
    """
    assert isinstance(ob_space, gym.spaces.Box)

    self.pdtype = pdtype = make_pdtype(ac_space)
    sequence_length = None

    ob_shape = OBSERVATION_DIM if PREPROCESS else ob_space.shape[0]
    ob = U.get_placeholder(name="ob", dtype=tf.float32, shape=[sequence_length, ob_shape])

    def _leaky_relu(x):
        # max(x, 0.01 * x): leaky ReLU with slope 0.01.
        return tf.maximum(x, 0.01 * x)

    if activation == 'tanh':
        activ = tf.nn.tanh
    elif activation == 'elu':
        activ = tf.nn.elu
    elif activation == 'lrelu':
        activ = _leaky_relu
    else:
        raise NotImplementedError("Not available activation: " + activation)

    def _hidden_stack(inputs, dense_format, dropout_format):
        # ``num_hid_layers`` of dense -> activation -> dropout.
        hidden = inputs
        for layer in range(1, num_hid_layers + 1):
            hidden = activ(U.dense(hidden, hid_size, dense_format % layer,
                                   weight_init=U.normc_initializer(1.0)))
            hidden = tf.nn.dropout(hidden, keep_prob=keep, name=dropout_format % layer)
        return hidden

    # Value network input: raw observation, or the filtered one.
    if PREPROCESS:
        value_input = ob
    else:
        with tf.variable_scope("obfilter"):
            self.ob_rms = RunningMeanStd(shape=ob_space.shape)
        obz = tf.clip_by_value((ob - self.ob_rms.mean) / self.ob_rms.std, -5.0, 5.0)
        value_input = obz

    value_hidden = _hidden_stack(value_input, "vffc%i", "vdrop%i")
    self.vpred = U.dense(value_hidden, 1, "vffinal",
                         weight_init=U.normc_initializer(1.0))[:, 0]

    policy_hidden = _hidden_stack(ob, "polfc%i", "pdrop%i")
    if gaussian_fixed_var and isinstance(ac_space, gym.spaces.Box):
        action_dim = pdtype.param_shape()[0] // 2
        mean = U.dense(policy_hidden, action_dim, "polfinal", U.normc_initializer(0.01))
        logstd = tf.get_variable(name="logstd", shape=[1, action_dim],
                                 initializer=tf.zeros_initializer())
        # ``mean * 0.0 + logstd`` broadcasts logstd to the shape of mean.
        pdparam = U.concatenate([mean, mean * 0.0 + logstd], axis=1)
    else:
        pdparam = U.dense(policy_hidden, pdtype.param_shape()[0], "polfinal",
                          U.normc_initializer(0.01))

    self.pd = pdtype.pdfromflat(pdparam)

    self.state_in = []
    self.state_out = []

    # ``stochastic`` selects between sampling and the distribution mode.
    stochastic = tf.placeholder(dtype=tf.bool, shape=())
    ac = U.switch(stochastic, self.pd.sample(), self.pd.mode())
    self._act = U.function([stochastic, ob], [ac, self.vpred])
def build_graph(self, ob, ac, scope, hid_layer, hid_size, out_size):
    """Build a CNN encoder for ``ob`` followed by an MLP over [features, ac].

    The filters and strides come from ``U.cnn(self.rnd_cnn_type)``.  The
    conv output is flattened, projected through the dense weights returned
    by ``U.dense`` (a single 500-unit layer here), concatenated with ``ac``
    and passed through ``hid_layer`` leaky-ReLU layers plus a linear output
    layer.

    Args:
        ob: observation tensor fed to the first convolution.
        ac: action tensor concatenated with the encoded observation on axis 1.
        scope: variable scope name (opened with ``tf.AUTO_REUSE``).
        hid_layer: number of hidden dense layers after the concatenation.
        hid_size: width of those hidden layers.
        out_size: width of the final linear layer.

    Returns:
        The output tensor of the final linear layer.

    Raises:
        AssertionError: unless there is more than one filter and as many
            strides as filters.
    """
    filters, strides, cnn_type = U.cnn(self.rnd_cnn_type)
    logger.log(f'critic cnn type: {cnn_type}')

    with tf.variable_scope(scope, reuse=tf.AUTO_REUSE):
        conv_out = tf.nn.conv2d(ob, filters[0], strides=strides[0], padding="VALID")
        assert len(filters) > 1 and len(strides) == len(filters)
        for idx in range(1, len(filters)):
            conv_out = tf.nn.conv2d(conv_out, filters[idx], strides[idx], "VALID")

        # flatten cnn output, except the batch axis
        flat_size = int(np.prod(conv_out.shape[1:]))
        ob = tf.reshape(conv_out, [-1, flat_size])
        logger.log(f"critic cnn ob output shape: {ob.shape}")

        encoded = ob
        list_of_output_shape = [500]
        logger.log(f"critic cnn dense: {list_of_output_shape}")
        # Here ``U.dense`` returns the weight and bias lists rather than a tensor.
        weights, biases = U.dense(encoded, list_of_output_shape)
        # ReLU after every projection except the last one.
        for idx in range(len(list_of_output_shape) - 1):
            encoded = tf.nn.relu(tf.add(tf.matmul(encoded, weights[idx]), biases[idx]))
        encoded = tf.add(tf.matmul(encoded, weights[-1]), biases[-1])
        ob = encoded

        joint = tf.concat([ob, ac], axis=1)
        for _ in range(hid_layer):
            joint = tf.layers.dense(joint, hid_size, activation=tf.nn.leaky_relu)
        joint = tf.layers.dense(joint, out_size, activation=None)
        logger.log(f"[ob, ac] dense hid_layer: {hid_layer}, hid_size: {hid_size}, out_size: {out_size}")

    return joint
def __init__(self, ob_dim, ac_dim, hid_size=128, num_hid_layers=2):  # pylint: disable=W0613
    """Build a SELU MLP value function trained with a KFAC optimizer.

    Sets ``self._predict`` (X -> predicted values), ``self.do_update``
    (X, vtarg -> update op) and ``self.q_runner``, then initialises any
    uninitialised TF variables.

    Args:
        ob_dim: observation dimensionality; the input placeholder has width
            ``ob_dim * 2 + ac_dim * 2 + 2``.
        ac_dim: action dimensionality (see ``ob_dim``).
        hid_size: width of each hidden layer.
        num_hid_layers: number of SELU hidden layers.
    """
    X = tf.placeholder(tf.float32, shape=[None, ob_dim * 2 + ac_dim * 2 + 2])  # batch of observations
    vtarg_n = tf.placeholder(tf.float32, shape=[None], name='vtarg')
    wd_dict = {}

    last_out = X
    for i in range(num_hid_layers):
        # The hidden layers do not register bias_init / weight_loss_dict;
        # only the final layer below does.
        last_out = tf.nn.selu(
            U.dense(last_out, hid_size, "vffc%i" % (i + 1),
                    weight_init=U.normc_initializer(1.0)))
    vpred_n = dense(last_out, 1, "hfinal", weight_init=None, bias_init=0,
                    weight_loss_dict=wd_dict)[:, 0]
    sample_vpred_n = vpred_n + tf.random_normal(tf.shape(vpred_n))

    wd_loss = tf.get_collection("vf_losses", None)
    loss = U.mean(tf.square(vpred_n - vtarg_n)) + tf.add_n(wd_loss)
    loss_sampled = U.mean(tf.square(vpred_n - tf.stop_gradient(sample_vpred_n)))
    self._predict = U.function([X], vpred_n)

    # ``async`` has been a reserved keyword since Python 3.7, so writing
    # ``async=1`` is a SyntaxError there.  Passing it through a dict
    # unpacking reaches the same optimizer parameter on every version.
    optim = kfac.KfacOptimizer(
        learning_rate=0.001, cold_lr=0.001 * (1.0 - 0.9), momentum=0.9,
        clip_kl=0.3, epsilon=0.1, stats_decay=0.95,
        kfac_update=2, cold_iter=50,
        weight_decay_dict=wd_dict, max_grad_norm=1.0,
        **{"async": 1})

    # Only variables whose name contains "vf" are optimised.
    vf_var_list = [var for var in tf.trainable_variables() if "vf" in var.name]

    update_op, self.q_runner = optim.minimize(loss, loss_sampled, var_list=vf_var_list)
    self.do_update = U.function([X, vtarg_n], update_op)  # pylint: disable=E1101
    U.initialize()  # Initialize uninitialized TF variables
def _create_network(self):
    """Build the shared CNN trunk selected by ``self.kind``.

    ``self.ob`` is divided by 255.0, passed through the conv stack,
    flattened and fed to one ReLU dense layer.  The resulting features are
    used for both arguments of ``_create_logit_value``.

    Raises:
        NotImplementedError: if ``self.kind`` is neither ``'small'`` nor
            ``'large'``.
    """
    features = self.ob / 255.0

    # Each conv spec is (num_filters, name, filter_size, stride).
    if self.kind == 'small':  # from A3C paper
        conv_specs = [
            (16, "l1", [8, 8], [4, 4]),
            (32, "l2", [4, 4], [2, 2]),
        ]
        dense_units = 256
    elif self.kind == 'large':  # Nature DQN
        conv_specs = [
            (32, "l1", [8, 8], [4, 4]),
            (64, "l2", [4, 4], [2, 2]),
            (64, "l3", [3, 3], [1, 1]),
        ]
        dense_units = 512
    else:
        raise NotImplementedError

    for num_filters, layer_name, filter_size, stride in conv_specs:
        features = tf.nn.relu(
            U.conv2d(features, num_filters, layer_name, filter_size, stride, pad="VALID"))
    features = U.flattenallbut0(features)
    features = tf.nn.relu(U.dense(features, dense_units, 'lin', U.normc_initializer(1.0)))

    self._create_logit_value(features, features)
def _init(self, ob_space, ac_space, hid_size, num_hid_layers, gaussian_fixed_var=True, sigma_z=1.0, phi=None,
          normalize=True):
    """Build an autoregressive-noise MLP policy and its value function.

    With ``p = len(phi)`` (0 when ``phi`` is None), the observation
    placeholder carries ``p + 1`` steps per sample, and the "ac",
    "past_x" and "update_mask" placeholders carry ``p`` steps.  The value
    network (scope 'vf') reads only the last observation step; the policy
    network (scope 'pol') is applied to every step after flattening the
    leading axes.

    NOTE(review): ``mean`` and ``logstd`` are only bound in the
    fixed-variance Gaussian branch, yet ``_act`` always outputs them, so
    the other branch would fail when ``_act`` is built -- confirm it is
    never taken.

    Args:
        ob_space: observation space; must be a ``gym.spaces.Box``.
        ac_space: action space handed to ``make_ar_pdtype``.
        hid_size: hidden layer width.
        num_hid_layers: number of tanh hidden layers per network.
        gaussian_fixed_var: use a state-independent ``logstd`` variable when
            truthy and ``ac_space`` is a Box.
        sigma_z: stored on ``self`` and forwarded to ``pdfromflat``.
        phi: sequence whose length sets ``p``; forwarded to ``pdfromflat``.
        normalize: whether to create and apply the observation filter.

    Raises:
        AssertionError: if ``ob_space`` is not a ``gym.spaces.Box``.
    """
    assert isinstance(ob_space, gym.spaces.Box)

    self.pdtype = pdtype = make_ar_pdtype(ac_space)
    sequence_length = None
    self.sigma_z = sigma_z

    p = len(phi) if phi is not None else 0

    ob = U.get_placeholder(name="ob", dtype=tf.float32,
                           shape=[sequence_length, p + 1] + list(ob_space.shape))
    acs = U.get_placeholder(name="ac", dtype=tf.float32,
                            shape=[sequence_length, p] + list(ac_space.shape))
    past_x = U.get_placeholder(name="past_x", dtype=tf.float32,
                               shape=[sequence_length, p] + list(ac_space.shape))
    update_mask = U.get_placeholder(name="update_mask", dtype=tf.float32,
                                    shape=[sequence_length, p, 1])

    if normalize:
        with tf.variable_scope("obfilter"):
            self.ob_rms = RunningMeanStd(shape=ob_space.shape[-1])

    with tf.variable_scope('vf'):
        if normalize:
            obz = tf.clip_by_value((ob - self.ob_rms.mean) / self.ob_rms.std, -5.0, 5.0)
        else:
            obz = ob
        # The value function only sees the most recent observation step.
        value_hidden = obz[:, -1, :]
        for layer in range(num_hid_layers):
            value_hidden = tf.nn.tanh(
                dense(value_hidden, hid_size, name="fc%i" % (layer + 1),
                      weight_init=U.normc_initializer(1.0)))
        self.vpred = dense(value_hidden, 1, name='final',
                           weight_init=U.normc_initializer(1.0))[:, 0]

    with tf.variable_scope('pol'):
        # Collapse the leading axes so every step goes through the same MLP.
        obz = tf.reshape(obz, [-1, obz.shape[-1]])
        policy_hidden = obz
        for layer in range(num_hid_layers):
            policy_hidden = tf.nn.tanh(
                dense(policy_hidden, hid_size, name='fc%i' % (layer + 1),
                      weight_init=U.normc_initializer(1.0)))
        if gaussian_fixed_var and isinstance(ac_space, gym.spaces.Box):
            half = pdtype.param_shape()[0] // 2
            mean = dense(policy_hidden, half, name='final',
                         weight_init=U.normc_initializer(0.01))
            # Regroup the per-step means into one row of p + 1 steps.
            mean = tf.reshape(mean, [-1, mean.shape[-1] * (p + 1)])
            logstd = tf.get_variable(name="logstd", shape=[1, half],
                                     initializer=tf.zeros_initializer())
            # Only the first ``ac_space.shape[-1]`` columns are used to
            # broadcast logstd across the batch.
            broadcast_logstd = mean[:, :ac_space.shape[-1]] * 0.0 + logstd
            pdparam = tf.concat([mean, broadcast_logstd], axis=1)
        else:
            pdparam = U.dense(policy_hidden, pdtype.param_shape()[0], name='final',
                              weight_init=U.normc_initializer(0.01))

    self.pd = pdtype.pdfromflat(pdparam, phi, sigma_z)

    self.state_in = []
    self.state_out = []

    ac, past_x_next = self.pd.sample(acs, past_x, update_mask)
    self._act = U.function([ob, acs, past_x, update_mask],
                           [ac, self.vpred, mean, logstd, past_x_next])
def _init(self, ob_space, ac_space, hid_size, num_hid_layers, gaussian_fixed_var=True, gmm_comp=1,
          mirror_loss=False, observation_permutation=[], action_permutation=[]):
    """Build an MLP policy with optional GMM head and mirror-symmetry output.

    Args:
        ob_space: observation space; must be a ``gym.spaces.Box``.
        ac_space: action space handed to ``make_pdtype``.
        hid_size: hidden layer width.
        num_hid_layers: number of tanh hidden layers per network.
        gaussian_fixed_var: use a state-independent ``logstd`` variable when
            truthy and ``ac_space`` is a Box.
        gmm_comp: number of mixture components; 1 selects the plain head.
        mirror_loss: when truthy, also build ``self.mirrored_mean``, the
            policy mean evaluated on the mirrored observation and mapped
            back through the action permutation.
        observation_permutation: signed indices; entry ``i`` places
            ``sign(perm)`` at row ``i``, column ``abs(perm)`` of the
            observation permutation matrix.  Read-only here.
        action_permutation: same encoding for actions.  Read-only here.

    Raises:
        AssertionError: if ``ob_space`` is not a Box, or if ``mirror_loss``
            is set without ``gaussian_fixed_var``.
    """
    assert isinstance(ob_space, gym.spaces.Box)
    if mirror_loss:
        assert gaussian_fixed_var  # assume fixed std for now

    self.pdtype = pdtype = make_pdtype(ac_space, gmm_comp)
    sequence_length = None

    self.mirror_loss = mirror_loss

    def _signed_permutation_matrix(permutation):
        # Square float32 matrix with sign(perm) at [row, |perm|].
        size = len(permutation)
        matrix = np.zeros((size, size), dtype=np.float32)
        for row, signed_index in enumerate(permutation):
            matrix[row][int(np.abs(signed_index))] = np.sign(signed_index)
        return matrix

    if mirror_loss:
        # construct permutation matrices
        obs_perm_mat = _signed_permutation_matrix(observation_permutation)
        act_perm_mat = _signed_permutation_matrix(action_permutation)

    ob = U.get_placeholder(name="ob", dtype=tf.float32,
                           shape=[sequence_length] + list(ob_space.shape))

    # Value network.
    value_hidden = ob
    for layer in range(num_hid_layers):
        value_hidden = tf.nn.tanh(
            U.dense(value_hidden, hid_size, "vffc%i" % (layer + 1),
                    weight_init=U.normc_initializer(1.0)))
    self.vpred = U.dense(value_hidden, 1, "vffinal",
                         weight_init=U.normc_initializer(1.0))[:, 0]

    # Policy network; weights and biases are collected so the same layers
    # can be re-applied to the mirrored observation below.
    policy_hidden = ob
    params = []
    for layer in range(num_hid_layers):
        pre_activation, weight, bias = U.dense_wparams(
            policy_hidden, hid_size, "polfc%i" % (layer + 1),
            weight_init=U.normc_initializer(1.0))
        policy_hidden = tf.nn.tanh(pre_activation)
        params.append([weight, bias])

    box_fixed_var = gaussian_fixed_var and isinstance(ac_space, gym.spaces.Box)
    if box_fixed_var and gmm_comp == 1:
        half = pdtype.param_shape()[0] // 2
        mean, weight, bias = U.dense_wparams(policy_hidden, half, "polfinal",
                                             U.normc_initializer(0.01))
        params.append([weight, bias])
        self.mean = mean
        logstd = tf.get_variable(name="logstd", shape=[1, half],
                                 initializer=tf.zeros_initializer())
        pdparam = U.concatenate([mean, mean * 0.0 + logstd], axis=1)
    elif box_fixed_var:
        means = U.dense(policy_hidden, (pdtype.param_shape()[0] - gmm_comp) // 2,
                        "polfinal", U.normc_initializer(0.01))
        # logstd starts at -1.0 for every mixture dimension.
        logstd_init = np.ones((1, (pdtype.param_shape()[0] - gmm_comp) // 2),
                              dtype=np.float32) * (-1.0)
        logstd = tf.get_variable(name="logstd", initializer=tf.constant(logstd_init))
        weights = tf.nn.softmax(
            U.dense(policy_hidden, gmm_comp, "gmmweights", U.normc_initializer(0.01)))
        pdparam = U.concatenate([means, means * 0.0 + logstd, weights], axis=1)
    elif gmm_comp == 1:
        pdparam = U.dense(policy_hidden, pdtype.param_shape()[0], "polfinal",
                          U.normc_initializer(0.01))
    else:
        meanstd = U.dense(policy_hidden, pdtype.param_shape()[0] - gmm_comp, "polfinal",
                          U.normc_initializer(0.01))
        weights = tf.nn.softmax(
            U.dense(policy_hidden, gmm_comp, "gmmweights", U.normc_initializer(0.01)))
        pdparam = U.concatenate([meanstd, weights], axis=1)

    if mirror_loss:
        # Re-run the collected policy layers on the mirrored observation.
        mirrored_ob = tf.matmul(ob, obs_perm_mat)
        mirrored_hidden = mirrored_ob
        for weight, bias in params[:-1]:
            mirrored_hidden = tf.nn.tanh(tf.matmul(mirrored_hidden, weight) + bias)
        mean_mir_obs = tf.matmul(mirrored_hidden, params[-1][0]) + params[-1][1]
        self.mirrored_mean = tf.matmul(mean_mir_obs, act_perm_mat)

    if gmm_comp == 1:
        self.pd = pdtype.pdfromflat(pdparam)
    else:
        self.pd = pdtype.pdfromflat([pdparam, gmm_comp])

    self.state_in = []
    self.state_out = []

    # ``stochastic`` selects between sampling and the distribution mode.
    stochastic = tf.placeholder(dtype=tf.bool, shape=())
    ac = U.switch(stochastic, self.pd.sample(), self.pd.mode())
    self._act = U.function([stochastic, ob], [ac, self.vpred])
def _init(self, ob_space, ac_space, hid_size, num_hid_layers, gaussian_fixed_var=True, num_options=2, dc=0):
    """Build a two-option policy whose observation ends with the last action.

    The trailing ``ac_space.shape[0]`` entries of the observation are
    treated as the previous action u[k-1].  Networks marked "option 0"
    read the full normalised observation; those marked "option 1" read the
    "pure" observation with that tail removed.  ``option[0]`` is used as
    the condition of every ``U.switch`` that chooses between the two
    branches.

    NOTE(review): ``logstd`` is only bound in the fixed-variance Gaussian
    branch, yet ``_act`` always outputs it -- confirm the other branch is
    never taken.

    Args:
        ob_space: observation space; must be a ``gym.spaces.Box``.
        ac_space: action space handed to ``make_pdtype``.
        hid_size: hidden layer width.
        num_hid_layers: number of tanh hidden layers per network.
        gaussian_fixed_var: use per-option ``logstd`` variables when truthy
            and ``ac_space`` is a Box.
        num_options: number of options for the ``dense3D2`` heads.
        dc: stored on ``self``; not used in this method.

    Raises:
        AssertionError: if ``ob_space`` is not a ``gym.spaces.Box``.
    """
    assert isinstance(ob_space, gym.spaces.Box)

    self.ac_space_dim = ac_space.shape[0]
    self.ob_space_dim = ob_space.shape[0]
    self.dc = dc
    self.last_action = tf.zeros(ac_space.shape, dtype=tf.float32)
    self.last_action_init = tf.zeros(ac_space.shape, dtype=tf.float32)
    self.num_options = num_options
    self.pdtype = pdtype = make_pdtype(ac_space)
    sequence_length = None

    ob = U.get_placeholder(name="ob", dtype=tf.float32,
                           shape=[sequence_length] + list(ob_space.shape))
    option = U.get_placeholder(name="option", dtype=tf.int32, shape=[None])

    # create a filter for the pure shape, meaning excluding u[k-1]
    obs_shape_pure = ((self.ob_space_dim - self.ac_space_dim),)

    with tf.variable_scope("obfilter"):
        self.ob_rms = RunningMeanStd(shape=ob_space.shape)
    with tf.variable_scope("obfilter_pure"):
        self.ob_rms_only = RunningMeanStd(shape=obs_shape_pure)

    obz = tf.clip_by_value((ob - self.ob_rms.mean) / self.ob_rms.std, -5.0, 5.0)
    obz_pure = tf.clip_by_value(
        (ob[:, :-self.ac_space_dim] - self.ob_rms_only.mean) / self.ob_rms_only.std,
        -5.0, 5.0)

    # ---- value function: one network per option ----
    value_full = obz  # for option 0
    value_pure = obz_pure  # for option 1
    for layer in range(num_hid_layers):
        value_full = tf.nn.tanh(
            U.dense(value_full, hid_size, "vffc0%i" % (layer + 1),
                    weight_init=U.normc_initializer(1.0)))
        value_pure = tf.nn.tanh(
            U.dense(value_pure, hid_size, "vffc1%i" % (layer + 1),
                    weight_init=U.normc_initializer(1.0)))
    value_full = U.dense(value_full, 1, "vfff0", weight_init=U.normc_initializer(1.0))
    value_pure = U.dense(value_pure, 1, "vfff1", weight_init=U.normc_initializer(1.0))

    self.vpred = U.switch(option[0], value_pure, value_full)[:, 0]

    # ---- policy over options: one scalar score per option, then softmax ----
    score_full = obz  # for option 0
    score_pure = obz_pure  # for option 1
    for layer in range(num_hid_layers):
        score_full = tf.nn.tanh(
            U.dense(score_full, hid_size, "oppi0%i" % (layer + 1),
                    weight_init=U.normc_initializer(1.0)))
        score_pure = tf.nn.tanh(
            U.dense(score_pure, hid_size, "oppi1%i" % (layer + 1),
                    weight_init=U.normc_initializer(1.0)))
    score_full = U.dense(score_full, 1, "oppif0", weight_init=U.normc_initializer(1.0))
    score_pure = U.dense(score_pure, 1, "oppif1", weight_init=U.normc_initializer(1.0))
    option_scores = tf.concat([score_full, score_pure], 1)
    self.op_pi = tf.nn.softmax(option_scores)

    # ---- termination head (gradient does not flow into the option scores) ----
    self.tpred = tf.nn.sigmoid(
        dense3D2(tf.stop_gradient(option_scores), 1, "termhead", option,
                 num_options=num_options,
                 weight_init=U.normc_initializer(1.0)))[:, 0]
    # Termination is hard-wired to True instead of being sampled from tpred.
    termination_sample = tf.constant([True])

    # ---- intra-option policy on the pure observation ----
    last_out = obz_pure
    for layer in range(num_hid_layers):
        last_out = tf.nn.tanh(
            U.dense(last_out, hid_size, "polfc%i" % (layer + 1),
                    weight_init=U.normc_initializer(1.0)))

    if gaussian_fixed_var and isinstance(ac_space, gym.spaces.Box):
        half = pdtype.param_shape()[0] // 2
        mean = dense3D2(last_out, half, "polfinal", option,
                        num_options=num_options,
                        weight_init=U.normc_initializer(0.01), bias=False)
        mean = tf.nn.tanh(mean)
        logstd = tf.get_variable(name="logstd", shape=[num_options, 1, half],
                                 initializer=tf.zeros_initializer())
        # The logstd row of the first option id in the batch is used.
        pdparam = U.concatenate([mean, mean * 0.0 + logstd[option[0]]], axis=1)
    else:
        pdparam = U.dense(last_out, pdtype.param_shape()[0], "polfinal",
                          U.normc_initializer(0.01))

    self.pd = pdtype.pdfromflat(pdparam)

    self.state_in = []
    self.state_out = []

    stochastic = tf.placeholder(dtype=tf.bool, shape=())
    ac = U.switch(stochastic, self.pd.sample(), self.pd.mode())
    # Select between the policy action and the previous action stored in
    # the observation tail, conditioned on option[0]; then clip to [-1, 1].
    ac = U.switch(option[0], ac, tf.stop_gradient(ob[:, -self.ac_space_dim:]))
    ac = tf.clip_by_value(ac, -1.0, 1.0)

    self.last_action = tf.stop_gradient(ac)
    self._act = U.function([stochastic, ob, option],
                           [ac, self.vpred, last_out, logstd])

    self._get_v = U.function([ob, option], [self.vpred])
    self.get_term = U.function([ob, option], [termination_sample])
    self.get_tpred = U.function([ob, option], [self.tpred])
    self.get_vpred = U.function([ob, option], [self.vpred])
    self._get_op = U.function([ob], [self.op_pi])
def _init(self, ob_space, ac_space, hid_size, num_hid_layers, gaussian_fixed_var=True,
          num_options=2, dc=0, w_intfc=True):
    """Build the TensorFlow graph for an option-conditioned MLP policy.

    Five independent tanh MLP towers are stacked on the normalised
    observation.  They feed, in creation order: the value head
    (``vpred``), the termination head (``tpred``), the action
    distribution (``pd``), the ``intfc`` head (sigmoid, ``num_options``
    outputs) and the policy over options (``op_pi``, softmax).

    Args:
        ob_space: observation space; must be a ``gym.spaces.Box``.
        ac_space: action space handed to ``make_pdtype``.
        hid_size: width of every hidden layer.
        num_hid_layers: number of hidden layers in each tower.
        gaussian_fixed_var: if true and ``ac_space`` is a Box, use a
            state-independent ``logstd`` variable indexed by option.
        num_options: number of options.
        dc: stored on ``self``; not otherwise used here.
        w_intfc: stored on ``self``; not otherwise used here.
    """
    assert isinstance(ob_space, gym.spaces.Box)

    # Plain bookkeeping attributes.
    self.w_intfc = w_intfc
    self.state_in = []
    self.state_out = []
    self.dc = dc
    self.num_options = num_options
    self.pdtype = pdtype = make_pdtype(ac_space)

    # Graph inputs.
    sequence_length = None
    ob = U.get_placeholder(name="ob", dtype=tf.float32,
                           shape=[sequence_length] + list(ob_space.shape))
    option = U.get_placeholder(name="option", dtype=tf.int32, shape=[None])

    # Running-statistics normalisation, clipped to [-5, 5].
    with tf.variable_scope("obfilter"):
        self.ob_rms = RunningMeanStd(shape=ob_space.shape)
    obz = tf.clip_by_value((ob - self.ob_rms.mean) / self.ob_rms.std, -5.0, 5.0)

    def tanh_tower(prefix):
        """Stack ``num_hid_layers`` tanh layers named ``<prefix><k>`` on ``obz``."""
        hidden = obz
        for depth in range(1, num_hid_layers + 1):
            hidden = tf.nn.tanh(
                U.dense(hidden, hid_size, "%s%i" % (prefix, depth),
                        weight_init=U.normc_initializer(1.0)))
        return hidden

    # --- value head -------------------------------------------------------
    value_features = tanh_tower("vffc")
    value_out = dense3D2(value_features, 1, "vffinal", option,
                         num_options=num_options,
                         weight_init=U.normc_initializer(1.0))
    self.vpred = value_out[:, 0]

    # --- termination head (no gradient into its tower) ---------------------
    term_features = tanh_tower("termfc")
    term_logit = dense3D2(tf.stop_gradient(term_features), 1, "termhead", option,
                          num_options=num_options,
                          weight_init=U.normc_initializer(1.0))
    self.tpred = tf.nn.sigmoid(term_logit)[:, 0]
    # Bernoulli draw: terminate when tpred exceeds a uniform sample in [0, 1).
    uniform_noise = tf.random_uniform(shape=tf.shape(self.tpred), maxval=1.)
    termination_sample = tf.greater(self.tpred, uniform_noise)

    # --- action distribution -----------------------------------------------
    policy_features = tanh_tower("polfc")
    if gaussian_fixed_var and isinstance(ac_space, gym.spaces.Box):
        mean = dense3D2(policy_features, pdtype.param_shape()[0] // 2, "polfinal",
                        option, num_options=num_options,
                        weight_init=U.normc_initializer(0.01))
        logstd = tf.get_variable(
            name="logstd",
            shape=[num_options, 1, pdtype.param_shape()[0] // 2],
            initializer=tf.zeros_initializer())
        # `mean * 0.0 + ...` broadcasts the selected logstd row over the batch.
        pdparam = U.concatenate([mean, mean * 0.0 + logstd[option[0]]], axis=1)
    else:
        pdparam = U.dense(policy_features, pdtype.param_shape()[0], "polfinal",
                          U.normc_initializer(0.01))
    self.pd = pdtype.pdfromflat(pdparam)

    stochastic = tf.placeholder(dtype=tf.bool, shape=())
    ac = U.switch(stochastic, self.pd.sample(), self.pd.mode())

    # --- intfc head ----------------------------------------------------------
    intfc_features = tanh_tower("intfc")
    self.intfc = tf.sigmoid(
        U.dense(intfc_features, num_options, "intfcfinal",
                weight_init=U.normc_initializer(1.0)))

    # --- policy over options -------------------------------------------------
    op_features = tanh_tower("OP")
    self.op_pi = tf.nn.softmax(
        U.dense(op_features, num_options, "OPfinal",
                weight_init=U.normc_initializer(1.0)))

    # Callable wrappers around the graph.
    self._act = U.function([stochastic, ob, option], [ac])
    self.get_term = U.function([ob, option], [termination_sample])
    self.get_tpred = U.function([ob, option], [self.tpred])
    self.get_vpred = U.function([ob, option], [self.vpred])
    self._get_op_int = U.function([ob], [self.op_pi, self.intfc])
    self._get_intfc = U.function([ob], [self.intfc])
    self._get_op = U.function([ob], [self.op_pi])
def _init(self, ob_space, ac_space, kind, num_options=2, dc=0, w_intfc=True):
    """Build the TensorFlow graph for a convolutional option policy.

    A conv trunk selected by ``kind`` yields a shared ``hidden`` feature
    vector.  All heads sit on it: action distribution (``pd``), value
    (``vpred``), termination (``tpred``), policy over options
    (``op_pi``) and the ``intfc`` head.

    Args:
        ob_space: observation space; must be a ``gym.spaces.Box``.
        ac_space: action space handed to ``make_pdtype``.
        kind: ``'small'`` (two conv layers, 256 units) or ``'large'``
            (three conv layers, 512 units).
        num_options: number of options.
        dc: stored on ``self``; not otherwise used here.
        w_intfc: stored on ``self``; not otherwise used here.

    Raises:
        NotImplementedError: if ``kind`` is neither ``'small'`` nor
            ``'large'``.
    """
    assert isinstance(ob_space, gym.spaces.Box)

    self.w_intfc = w_intfc
    self.state_in = []
    self.state_out = []
    self.dc = dc
    self.num_options = num_options
    self.pdtype = pdtype = make_pdtype(ac_space)

    sequence_length = None
    ob = U.get_placeholder(name="ob", dtype=tf.float32,
                           shape=[sequence_length] + list(ob_space.shape))
    option = U.get_placeholder(name="option", dtype=tf.int32, shape=[None])

    # Observations are divided by 255 before the conv trunk.
    x = ob / 255.0

    # (filters, scope name, kernel, stride) per conv layer, then the dense width.
    if kind == 'small':  # from A3C paper
        conv_layers = (
            (16, "l1", [8, 8], [4, 4]),
            (32, "l2", [4, 4], [2, 2]),
        )
        dense_units = 256
    elif kind == 'large':  # Nature DQN
        conv_layers = (
            (32, "l1", [8, 8], [4, 4]),
            (64, "l2", [4, 4], [2, 2]),
            (64, "l3", [3, 3], [1, 1]),
        )
        dense_units = 512
    else:
        raise NotImplementedError

    for filters, scope, kernel, stride in conv_layers:
        x = tf.nn.relu(U.conv2d(x, filters, scope, kernel, stride, pad="VALID"))
    x = U.flattenallbut0(x)
    hidden = tf.nn.relu(
        tf.layers.dense(x, dense_units, name='lin',
                        kernel_initializer=U.normc_initializer(1.0)))

    # Action distribution.
    logits = dense3D2(hidden, pdtype.param_shape()[0], "polfinal", option,
                      num_options=num_options,
                      weight_init=U.normc_initializer(0.01))
    self.pd = pdtype.pdfromflat(logits)

    # Value head.
    value_out = dense3D2(hidden, 1, "vffinal", option, num_options=num_options,
                         weight_init=U.normc_initializer(1.0))
    self.vpred = value_out[:, 0]

    # Termination head; gradients are blocked from reaching the trunk.
    term_logit = dense3D2(tf.stop_gradient(hidden), 1, "termhead", option,
                          num_options=num_options,
                          weight_init=U.normc_initializer(1.0))
    self.tpred = tf.nn.sigmoid(term_logit)[:, 0]
    uniform_noise = tf.random_uniform(shape=tf.shape(self.tpred), maxval=1.)
    termination_sample = tf.greater(self.tpred, uniform_noise)

    stochastic = tf.placeholder(dtype=tf.bool, shape=())
    # XXX: the action is always sampled; `stochastic` is accepted by `_act`
    # but does not influence the result.
    ac = self.pd.sample()

    self.op_pi = tf.nn.softmax(
        U.dense(hidden, num_options, "OPfinal",
                weight_init=U.normc_initializer(1.0)))
    self.intfc = tf.sigmoid(
        U.dense(hidden, num_options, "intfcfinal",
                weight_init=U.normc_initializer(1.0)))

    # Callable wrappers around the graph.
    self._act = U.function([stochastic, ob, option], [ac])
    self.get_term = U.function([ob, option], [termination_sample])
    self.get_tpred = U.function([ob, option], [self.tpred])
    self.get_vpred = U.function([ob, option], [self.vpred])
    self._get_op_int = U.function([ob], [self.op_pi, self.intfc])
    self._get_intfc = U.function([ob], [self.intfc])
    self._get_op = U.function([ob], [self.op_pi])
def _init(self, ob_space, ac_space, hid_size, num_hid_layers, gaussian_fixed_var=True,
          gmm_comp=1, mirror_loss=False, observation_permutation=[],
          action_permutation=[]):
    """Build the TensorFlow graph for an MLP policy with optional mirror loss.

    Creates a value network (``vpred``) and a policy network whose output
    parameterises the action distribution ``pd``.  With ``mirror_loss``
    the policy weights are re-applied to a permuted copy of the
    observation and the resulting mean, permuted back in action space, is
    exposed as ``self.mirrored_mean``.

    Args:
        ob_space: observation space; must be a ``gym.spaces.Box``.
        ac_space: action space handed to ``make_pdtype``.
        hid_size: width of every hidden layer.
        num_hid_layers: number of hidden layers in each network.
        gaussian_fixed_var: use a state-independent ``logstd`` variable
            (required when ``mirror_loss`` is set).
        gmm_comp: number of mixture components; ``1`` selects the plain
            (non-mixture) parameterisation.
        mirror_loss: build the mirrored forward pass.
        observation_permutation: signed index list; entry ``i`` of value
            ``p`` puts ``sign(p)`` at row ``i``, column ``int(|p|)`` of the
            observation permutation matrix.
        action_permutation: same encoding for the action permutation.
    """
    assert isinstance(ob_space, gym.spaces.Box)
    if mirror_loss:
        assert gaussian_fixed_var  # assume fixed std for now

    # pd = probability distribution over possible actions
    self.pdtype = pdtype = make_pdtype(ac_space, gmm_comp)
    sequence_length = None
    self.mirror_loss = mirror_loss

    def signed_permutation_matrix(spec):
        """Return the square float32 matrix encoded by a signed index list."""
        size = len(spec)
        matrix = np.zeros((size, size), dtype=np.float32)
        for row, entry in enumerate(spec):
            matrix[row][int(np.abs(entry))] = np.sign(entry)
        return matrix

    if mirror_loss:
        # Mirroring is implemented with (signed) permutation matrices.
        obs_perm_mat = signed_permutation_matrix(observation_permutation)
        act_perm_mat = signed_permutation_matrix(action_permutation)

    ob = U.get_placeholder(name="ob", dtype=tf.float32,
                           shape=[sequence_length] + list(ob_space.shape))

    # Z-score the observation with running statistics, clipped to [-5, 5]:
    # Z = (X - mean) / std.
    with tf.variable_scope("obfilter"):
        self.ob_rms = RunningMeanStd(shape=ob_space.shape)
    obz = tf.clip_by_value((ob - self.ob_rms.mean) / self.ob_rms.std, -5.0, 5.0)

    # --- value function ------------------------------------------------------
    value_hidden = obz
    for depth in range(1, num_hid_layers + 1):
        value_hidden = tf.nn.tanh(
            U.dense(value_hidden, hid_size, "vffc%i" % depth,
                    weight_init=U.normc_initializer(1.0)))
    self.vpred = U.dense(value_hidden, 1, "vffinal",
                         weight_init=U.normc_initializer(1.0))[:, 0]

    # --- policy --------------------------------------------------------------
    # `params` collects [weights, bias] per layer so the same parameters can
    # be replayed on the mirrored observation below.
    last_out = obz
    params = []
    for depth in range(1, num_hid_layers + 1):
        pre_activation, layer_w, layer_b = U.dense_wparams(
            last_out, hid_size, "polfc%i" % depth,
            weight_init=U.normc_initializer(1.0))
        last_out = tf.nn.tanh(pre_activation)
        params.append([layer_w, layer_b])

    fixed_var_gaussian = gaussian_fixed_var and isinstance(ac_space, gym.spaces.Box)
    if fixed_var_gaussian:
        if gmm_comp == 1:
            mean, layer_w, layer_b = U.dense_wparams(
                last_out, pdtype.param_shape()[0] // 2, "polfinal",
                U.normc_initializer(0.01))
            params.append([layer_w, layer_b])
            self.mean = mean
            # log of the standard deviation of the action distribution
            logstd = tf.get_variable(
                name="logstd",
                shape=[1, pdtype.param_shape()[0] // 2],
                initializer=tf.zeros_initializer())
            pdparam = U.concatenate([mean, mean * 0.0 + logstd], axis=1)
        else:
            # NOTE(review): this branch does not append the output layer to
            # `params`, so the mirrored pass below would treat the last hidden
            # layer as the output -- confirm mirror_loss is never combined
            # with gmm_comp != 1.
            means = U.dense(last_out, (pdtype.param_shape()[0] - gmm_comp) // 2,
                            "polfinal", U.normc_initializer(0.01))
            logstd = tf.get_variable(
                name="logstd",
                initializer=tf.constant(
                    np.ones((1, (pdtype.param_shape()[0] - gmm_comp) // 2),
                            dtype=np.float32) * (-1.0)))
            weights = tf.nn.softmax(
                U.dense(last_out, gmm_comp, "gmmweights", U.normc_initializer(0.01)))
            pdparam = U.concatenate([means, means * 0.0 + logstd, weights], axis=1)
    elif gmm_comp == 1:
        pdparam = U.dense(last_out, pdtype.param_shape()[0], "polfinal",
                          U.normc_initializer(0.01))
    else:
        meanstd = U.dense(last_out, pdtype.param_shape()[0] - gmm_comp, "polfinal",
                          U.normc_initializer(0.01))
        weights = tf.nn.softmax(
            U.dense(last_out, gmm_comp, "gmmweights", U.normc_initializer(0.01)))
        pdparam = U.concatenate([meanstd, weights], axis=1)

    if mirror_loss:
        # Symmetry is measured on actions: run the permuted observation
        # through the very same policy weights ...
        mirrored_hidden = tf.matmul(obz, obs_perm_mat)
        for layer_w, layer_b in params[:-1]:
            mirrored_hidden = tf.nn.tanh(tf.matmul(mirrored_hidden, layer_w) + layer_b)
        out_w, out_b = params[-1]
        mean_mir_obs = tf.matmul(mirrored_hidden, out_w) + out_b
        # ... and permute the resulting mean back in action space.
        self.mirrored_mean = tf.matmul(mean_mir_obs, act_perm_mat)

    if gmm_comp == 1:
        self.pd = pdtype.pdfromflat(pdparam)
    else:
        self.pd = pdtype.pdfromflat([pdparam, gmm_comp])

    self.state_in = []
    self.state_out = []

    # `stochastic` selects between sampling from `pd` and taking its mode.
    stochastic = tf.placeholder(dtype=tf.bool, shape=())
    ac = U.switch(stochastic, self.pd.sample(), self.pd.mode())
    # Given (stochastic, ob), returns the chosen action and the value prediction.
    self._act = U.function([stochastic, ob], [ac, self.vpred])
def _init(self, ob_space, ac_space,hid_size_V, hid_size_actor, num_hid_layers,V_keep_prob, pol_keep_prob,\
          mc_samples,layer_norm,activation_critic,activation_actor, dropout_on_V, dropout_on_policy,tau, gaussian_fixed_var=True):
    """Build the TensorFlow graph for an actor/critic with dropout copies.

    Alongside the regular value network (``vpred``) this keeps
    ``mc_samples`` parallel copies of the network that are pushed through
    ``generate_dropout_layer`` layer by layer; their average is
    ``vpred_mc_mean``.  The policy is built either as a plain MLP or, when
    ``dropout_on_policy`` is set, with the same parallel-copy scheme.

    Args:
        ob_space: observation space; must be a ``gym.spaces.Box``.
        ac_space: action space handed to ``make_pdtype``.
        hid_size_V: hidden width of the value network.
        hid_size_actor: hidden width of the policy network.
        num_hid_layers: number of hidden layers in both networks.
        V_keep_prob: keep probability passed to ``generate_dropout_layer``
            for the value copies.
        pol_keep_prob: keep probability passed for the policy copies.
        mc_samples: number of parallel copies.
        layer_norm: wrap each value hidden layer in layer normalisation.
        activation_critic: activation callable for the value network.
        activation_actor: activation callable for the policy network.
        dropout_on_V: report ``vpred_mc_mean`` instead of ``vpred`` from
            the act functions.
        dropout_on_policy: build the policy with the parallel-copy scheme.
        tau: forwarded to ``pdtype.pdfromflat`` in the dropout policy branch.
        gaussian_fixed_var: use a state-independent ``logstd`` variable in
            the non-dropout policy branch.
    """
    assert isinstance(ob_space, gym.spaces.Box)
    self.dropout_on_policy = dropout_on_policy
    # self.pdtype = pdtype = make_pdtype(ac_space, dropout_on_policy)
    self.pdtype = pdtype = make_pdtype(ac_space)
    sequence_length = None
    ob = U.get_placeholder(name="ob", dtype=tf.float32, shape=[sequence_length] + list(ob_space.shape))
    # Observation normalisation with running statistics, clipped to [-5, 5].
    with tf.variable_scope("obfilter"):
        self.ob_rms = RunningMeanStd(shape=ob_space.shape)
    obz = tf.clip_by_value((ob - self.ob_rms.mean) / self.ob_rms.std, -5.0, 5.0)
    last_out = obz
    self.mc_samples = mc_samples
    self.pol_keep_prob = pol_keep_prob
    self.V_keep_prob = V_keep_prob
    ### MAIN CHANGES
    #######################
    # Value function
    with tf.variable_scope("value_function"):
        # Every copy starts from the same normalised observation tensor.
        dropout_networks = [last_out] * self.mc_samples
        # dropout_networks = generate_dropout_layer(lambda x: x, dropout_networks, self.pol_keep_prob)
        for i in range(num_hid_layers):
            # Order matters in both branches: the first call creates the
            # layer's variables, `apply_layer` then re-applies the same
            # variables (reuse=True) to each copy.
            # NOTE(review): `apply_layer` closes over `i`; this is only safe
            # if generate_dropout_layer calls it before the next iteration --
            # confirm.
            if layer_norm:
                last_out = activation_critic(tc.layers.layer_norm(tf.layers.dense(last_out, hid_size_V, name="vffc%i"%(i+1), \
                    kernel_initializer=U.normc_initializer(1.0)), center=True,scope="vffc_activation%i"%(i+1) ,scale=True))
                apply_layer = lambda x: activation_critic(
                    tc.layers.layer_norm(tf.layers.dense(
                        x, hid_size_V, name="vffc%i" % (i + 1), reuse=True),
                        center=True,
                        scope="vffc_activation%i" % (i + 1),
                        scale=True,
                        reuse=True))
            else:
                last_out = activation_critic(tf.layers.dense(last_out, hid_size_V, name="vffc%i"%(i+1), \
                    kernel_initializer=U.normc_initializer(1.0)))
                apply_layer = lambda x: activation_critic(
                    tf.layers.dense(
                        x, hid_size_V, name="vffc%i" % (i + 1), reuse=True))
            dropout_networks = generate_dropout_layer(
                apply_layer, dropout_networks, self.V_keep_prob)
        ## final layer
        self.vpred = tf.layers.dense(
            last_out, 1, name="vffinal",
            kernel_initializer=U.normc_initializer(1.0))[:, 0]
        apply_layer = lambda x : tf.layers.dense(x, 1, activation=None, \
            name="vffinal", reuse=True)[:,0]
        dropout_networks = generate_dropout_layer(apply_layer, dropout_networks, self.V_keep_prob)
        # Average of the per-copy value predictions.
        self.vpred_mc_mean = tf.add_n(dropout_networks) / float(
            len(dropout_networks))
        self.vpred_dropout_networks = dropout_networks
    #######################
    ## Policy
    last_out = obz
    with tf.variable_scope("policy"):
        if not self.dropout_on_policy:
            # Plain MLP policy.
            for i in range(num_hid_layers):
                last_out = U.dense(last_out, hid_size_actor, "polfc%i"%(i+1), \
                    weight_init=U.normc_initializer(1.0))
                last_out = activation_actor(last_out)
            if gaussian_fixed_var and isinstance(ac_space, gym.spaces.Box):
                mean = U.dense(last_out, pdtype.param_shape()[0] // 2, "polfinal", U.normc_initializer(0.01))
                logstd = tf.get_variable(
                    name="logstd",
                    shape=[1, pdtype.param_shape()[0] // 2],
                    initializer=tf.zeros_initializer())
                # `mean * 0.0 + logstd` broadcasts logstd over the batch.
                pdparam = U.concatenate([mean, mean * 0.0 + logstd], axis=1)
            else:
                pdparam = U.dense(last_out, pdtype.param_shape()[0], "polfinal", U.normc_initializer(0.01))
            self.pd = pdtype.pdfromflat(pdparam)
        else:
            # Parallel-copy policy: same create-then-reuse pattern as the
            # value network above.
            dropout_networks = [last_out] * mc_samples
            # Keep probability 1.0 on the input layer.
            dropout_networks = generate_dropout_layer(
                lambda x: x, dropout_networks, 1.0)
            for i in range(num_hid_layers):
                last_out = activation_actor(
                    tf.layers.dense(
                        last_out, hid_size_actor, activation=None,
                        name="polfc%i" % (i + 1),
                        kernel_initializer=U.normc_initializer(1.0),
                        bias_initializer=tf.zeros_initializer()))
                apply_layer = lambda x: activation_actor(
                    tf.layers.dense(
                        x, hid_size_actor, activation=None,
                        name="polfc%i" % (i + 1),
                        kernel_initializer=U.normc_initializer(1.0),
                        bias_initializer=tf.zeros_initializer(),
                        reuse=True))
                dropout_networks = generate_dropout_layer(
                    apply_layer, dropout_networks, pol_keep_prob)
            # `net` is not read again, but this call must stay: it creates
            # the "polfinal" variables that the reuse=True lambda depends on.
            net = tf.layers.dense(
                last_out, pdtype.param_shape()[0] // 2, name="polfinal",
                activation=None,
                kernel_initializer=U.normc_initializer(0.01))
            apply_layer = lambda x: tf.layers.dense(
                x, pdtype.param_shape()[0] // 2, activation=None,
                name="polfinal",
                kernel_initializer=U.normc_initializer(0.01),
                reuse=True)
            dropout_networks = generate_dropout_layer(
                apply_layer, dropout_networks, pol_keep_prob)
            # NOTE(review): pdfromflat receives the list of per-copy outputs
            # plus tau here, unlike the single-tensor call in the other
            # branch -- relies on the project's pdtype supporting that.
            self.pd = pdtype.pdfromflat(dropout_networks, tau)
    self.state_in = []
    self.state_out = []
    stochastic = tf.placeholder(dtype=tf.bool, shape=())
    ac = U.switch(stochastic, self.pd.sample(), self.pd.mode())
    last_out = obz
    ### MAIN CHANGES
    ## if dropout:
    # Choose which value estimate the act functions report.
    if dropout_on_V:
        vact = self.vpred_mc_mean
    else:
        vact = self.vpred
    if dropout_on_policy:
        # One act function per policy copy; `dropout_act` is defined
        # elsewhere on the class.
        self._actsfunc = [
            U.function([ob], [x, vact]) for x in dropout_networks
        ]
        self._act = self.dropout_act
    else:
        self._actfunc = U.function([stochastic, ob], [ac, vact])
        self._act = self.reg_act
def _init(self, ob_space, ac_space, hid_size, num_hid_layers, gaussian_fixed_var=True,
          num_options=2, dc=0):
    """Build the TensorFlow graph for a two-option policy with action hold.

    The trailing ``ac_space.shape[0]`` entries of the observation are
    treated as the previously applied action.  Option 0 networks see the
    full normalised observation (``obz``); option 1 networks see it with
    those trailing entries removed (``obz_pure``).  When ``option[0]`` is
    falsy the emitted action is the trailing observation slice (hold);
    otherwise it comes from the Gaussian policy.

    Args:
        ob_space: observation space; must be a ``gym.spaces.Box``.
        ac_space: action space; must be a ``gym.spaces.Box`` (see Raises).
        hid_size: width of every hidden layer.
        num_hid_layers: number of hidden layers in each network.
        gaussian_fixed_var: must be true (see Raises).
        num_options: number of options used by the ``dense3D2`` heads.
        dc: stored on ``self``; not otherwise used here.

    Raises:
        NotImplementedError: if the fixed-variance Gaussian head is not
            selected.  ``_act`` returns ``logstd``, which exists only for
            that head; previously this path failed later with an
            ``UnboundLocalError``.
    """
    assert isinstance(ob_space, gym.spaces.Box)

    # define action and observation space
    self.ac_space_dim = ac_space.shape[0]
    self.ob_space_dim = ob_space.shape[0]
    self.dc = dc
    self.last_action = tf.zeros(ac_space.shape, dtype=tf.float32)
    self.last_action_init = tf.zeros(ac_space.shape, dtype=tf.float32)
    self.num_options = num_options
    self.pdtype = pdtype = make_pdtype(ac_space)

    sequence_length = None
    ob = U.get_placeholder(name="ob", dtype=tf.float32,
                           shape=[sequence_length] + list(ob_space.shape))
    option = U.get_placeholder(name="option", dtype=tf.int32, shape=[None])

    # create a filter for the pure shape, meaning excluding u[k-1]
    obs_shape_pure = ((self.ob_space_dim - self.ac_space_dim), )
    with tf.variable_scope("obfilter"):
        self.ob_rms = RunningMeanStd(shape=ob_space.shape)
    with tf.variable_scope("obfilter_pure"):
        self.ob_rms_only = RunningMeanStd(shape=obs_shape_pure)
    obz = tf.clip_by_value((ob - self.ob_rms.mean) / self.ob_rms.std, -5.0, 5.0)
    obz_pure = tf.clip_by_value(
        (ob[:, :-self.ac_space_dim] - self.ob_rms_only.mean) / self.ob_rms_only.std,
        -5.0, 5.0)

    # implement Q-function approximation: one ReLU MLP per option
    last_out0 = obz  # for option 0
    last_out1 = obz_pure  # for option 1
    for i in range(num_hid_layers):
        last_out0 = tf.nn.relu(
            U.dense(last_out0, hid_size, "vffc0%i" % (i + 1),
                    weight_init=U.normc_initializer(1.0)))
        last_out1 = tf.nn.relu(
            U.dense(last_out1, hid_size, "vffc1%i" % (i + 1),
                    weight_init=U.normc_initializer(1.0)))
    last_out0 = U.dense(last_out0, 1, "vfff0", weight_init=U.normc_initializer(1.0))
    last_out1 = U.dense(last_out1, 1, "vfff1", weight_init=U.normc_initializer(1.0))
    # return the Q-function value of the active option
    self.vpred = U.switch(option[0], last_out1, last_out0)[:, 0]

    # implement parametrization for policy over options
    last_out0 = obz  # for option 0
    last_out1 = obz_pure  # for option 1
    for i in range(num_hid_layers):
        last_out0 = tf.nn.relu(
            U.dense(last_out0, hid_size, "oppi0%i" % (i + 1),
                    weight_init=U.normc_initializer(1.0)))
        last_out1 = tf.nn.relu(
            U.dense(last_out1, hid_size, "oppi1%i" % (i + 1),
                    weight_init=U.normc_initializer(1.0)))
    last_out0 = U.dense(last_out0, 1, "oppif0", weight_init=U.normc_initializer(1.0))
    last_out1 = U.dense(last_out1, 1, "oppif1", weight_init=U.normc_initializer(1.0))
    last_out = tf.concat([last_out0, last_out1], 1)
    # return probabilities for the options
    self.op_pi = tf.nn.softmax(last_out)

    # always terminate: tpred is still built, but the sample is constant True
    self.tpred = tf.nn.sigmoid(
        dense3D2(tf.stop_gradient(last_out), 1, "termhead", option,
                 num_options=num_options,
                 weight_init=U.normc_initializer(1.0)))[:, 0]
    termination_sample = tf.constant([True])

    # define the control policy / intra-option policy
    last_out = obz_pure
    for i in range(num_hid_layers):
        last_out = tf.nn.relu(
            U.dense(last_out, hid_size, "polfc%i" % (i + 1),
                    weight_init=U.normc_initializer(1.0)))
    if gaussian_fixed_var and isinstance(ac_space, gym.spaces.Box):
        mean = dense3D2(last_out, pdtype.param_shape()[0] // 2, "polfinal", option,
                        num_options=num_options,
                        weight_init=U.normc_initializer(0.01), bias=False)
        # ReLU-based hard clip of the mean to [-1, 1]
        mean = (-tf.nn.relu(-(mean - 1)) + tf.nn.relu(-(mean + 1))) + 1
        logstd = tf.get_variable(
            name="logstd",
            shape=[num_options, 1, pdtype.param_shape()[0] // 2],
            initializer=tf.zeros_initializer())
        pdparam = U.concatenate([mean, mean * 0.0 + logstd[option[0]]], axis=1)
    else:
        # `_act` below outputs `logstd`, which only this Gaussian head
        # defines; fail with a clear message instead of an
        # UnboundLocalError further down.
        raise NotImplementedError(
            "only the fixed-variance Gaussian policy on a Box action space "
            "is supported (gaussian_fixed_var=True)")
    self.pd = pdtype.pdfromflat(pdparam)

    self.state_in = []
    self.state_out = []

    # sample stochastically -> this corresponds to exploration
    stochastic = tf.placeholder(dtype=tf.bool, shape=())
    ac = U.switch(stochastic, self.pd.sample(), self.pd.mode())
    # choose the appropriate action, apply the ZOH if using option 0
    ac = U.switch(option[0], ac, tf.stop_gradient(ob[:, -self.ac_space_dim:]))
    ac = tf.clip_by_value(ac, -1.0, 1.0)
    self.last_action = tf.stop_gradient(ac)

    # Callable wrappers around the graph.
    self._act = U.function([stochastic, ob, option],
                           [ac, self.vpred, last_out, logstd])
    self._get_v = U.function([ob, option], [self.vpred])
    self.get_term = U.function([ob, option], [termination_sample])
    self.get_tpred = U.function([ob, option], [self.tpred])
    self.get_vpred = U.function([ob, option], [self.vpred])
    self._get_op = U.function([ob], [self.op_pi])
def _init(self, ob_space, ac_space, hid_size, num_hid_layers, gaussian_fixed_var=True,
          num_options=2, dc=0):
    """Build the TensorFlow graph for a two-option policy with action hold.

    The trailing ``ac_space.shape[0]`` entries of the observation are
    treated as the previously applied action.  Option 0 networks see the
    full normalised observation (``obz``); option 1 networks see it with
    those trailing entries removed (``obz_pure``).  Besides the usual
    outputs this variant exposes the raw option preference difference
    (``op_pi_orig``) and the deterministic action mean (``ac_mean``).

    Args:
        ob_space: observation space; must be a ``gym.spaces.Box``.
        ac_space: action space; must be a ``gym.spaces.Box`` (see Raises).
        hid_size: width of every hidden layer.
        num_hid_layers: number of hidden layers in each network.
        gaussian_fixed_var: must be true (see Raises).
        num_options: number of options used by the ``dense3D2`` heads.
        dc: stored on ``self``; not otherwise used here.

    Raises:
        NotImplementedError: if the fixed-variance Gaussian head is not
            selected.  ``ac_mean`` and ``_act`` need ``mean`` and
            ``logstd``, which exist only for that head; previously this
            path failed later with an ``UnboundLocalError``.
    """
    assert isinstance(ob_space, gym.spaces.Box)

    # init
    self.ac_space_dim = ac_space.shape[0]
    self.ob_space_dim = ob_space.shape[0]
    self.dc = dc
    self.last_action = tf.zeros(ac_space.shape, dtype=tf.float32)
    self.last_action_init = tf.zeros(ac_space.shape, dtype=tf.float32)
    self.num_options = num_options
    self.pdtype = pdtype = make_pdtype(ac_space)

    sequence_length = None
    ob = U.get_placeholder(name="ob", dtype=tf.float32,
                           shape=[sequence_length] + list(ob_space.shape))
    option = U.get_placeholder(name="option", dtype=tf.int32, shape=[None])

    # create a filter for the pure shape, meaning excluding u[k-1]
    obs_shape_pure = ((self.ob_space_dim - self.ac_space_dim), )
    with tf.variable_scope("obfilter"):
        self.ob_rms = RunningMeanStd(shape=ob_space.shape)
    with tf.variable_scope("obfilter_pure"):
        self.ob_rms_only = RunningMeanStd(shape=obs_shape_pure)
    obz = tf.clip_by_value((ob - self.ob_rms.mean) / self.ob_rms.std, -5.0, 5.0)
    obz_pure = tf.clip_by_value(
        (ob[:, :-self.ac_space_dim] - self.ob_rms_only.mean) / self.ob_rms_only.std,
        -5.0, 5.0)

    # return Q-function value: one ReLU MLP per option
    last_out0 = obz  # for option 0
    last_out1 = obz_pure  # for option 1
    for i in range(num_hid_layers):
        last_out0 = tf.nn.relu(
            U.dense(last_out0, hid_size, "vffc0%i" % (i + 1),
                    weight_init=U.normc_initializer(1.0)))
        last_out1 = tf.nn.relu(
            U.dense(last_out1, hid_size, "vffc1%i" % (i + 1),
                    weight_init=U.normc_initializer(1.0)))
    last_out0 = U.dense(last_out0, 1, "vfff0", weight_init=U.normc_initializer(1.0))
    last_out1 = U.dense(last_out1, 1, "vfff1", weight_init=U.normc_initializer(1.0))
    self.vpred = U.switch(option[0], last_out1, last_out0)[:, 0]

    # policy over options:
    last_out0 = obz  # for option 0
    last_out1 = obz_pure  # for option 1
    for i in range(num_hid_layers):
        last_out0 = tf.nn.relu(
            U.dense(last_out0, hid_size, "oppi0%i" % (i + 1),
                    weight_init=U.normc_initializer(1.0)))
        last_out1 = tf.nn.relu(
            U.dense(last_out1, hid_size, "oppi1%i" % (i + 1),
                    weight_init=U.normc_initializer(1.0)))
    last_out0 = U.dense(last_out0, 1, "oppif0", weight_init=U.normc_initializer(1.0))
    last_out1 = U.dense(last_out1, 1, "oppif1", weight_init=U.normc_initializer(1.0))
    last_out = tf.concat([last_out0, last_out1], 1)
    # besides the softmax, also expose the difference between the two raw
    # output values
    self.op_pi_orig = last_out0 - last_out1
    self.op_pi = tf.nn.softmax(last_out)

    # still always terminate: tpred is built, but the sample is constant True
    self.tpred = tf.nn.sigmoid(
        dense3D2(tf.stop_gradient(last_out), 1, "termhead", option,
                 num_options=num_options,
                 weight_init=U.normc_initializer(1.0)))[:, 0]
    termination_sample = tf.constant([True])

    # intra-option control policy
    last_out = obz_pure
    for i in range(num_hid_layers):
        last_out = tf.nn.relu(
            U.dense(last_out, hid_size, "polfc%i" % (i + 1),
                    weight_init=U.normc_initializer(1.0)))
    if gaussian_fixed_var and isinstance(ac_space, gym.spaces.Box):
        mean = dense3D2(last_out, pdtype.param_shape()[0] // 2, "polfinal", option,
                        num_options=num_options,
                        weight_init=U.normc_initializer(0.01), bias=False)
        # ReLU-based hard clip of the mean to [-1, 1] (used instead of tanh)
        mean = (-tf.nn.relu(-(mean - 1)) + tf.nn.relu(-(mean + 1))) + 1
        logstd = tf.get_variable(
            name="logstd",
            shape=[num_options, 1, pdtype.param_shape()[0] // 2],
            initializer=tf.zeros_initializer())
        pdparam = U.concatenate([mean, mean * 0.0 + logstd[option[0]]], axis=1)
    else:
        # `ac_mean` and `_act` below need `mean` and `logstd`, which only
        # this Gaussian head defines; fail with a clear message instead of
        # an UnboundLocalError further down.
        raise NotImplementedError(
            "only the fixed-variance Gaussian policy on a Box action space "
            "is supported (gaussian_fixed_var=True)")
    self.pd = pdtype.pdfromflat(pdparam)

    self.state_in = []
    self.state_out = []

    # keep the variable which only incorporates the mean
    ac_mean = mean
    ac_mean = U.switch(option[0], ac_mean,
                       tf.stop_gradient(ob[:, -self.ac_space_dim:]))
    self.ac_mean = tf.clip_by_value(ac_mean, -1.0, 1.0)

    stochastic = tf.placeholder(dtype=tf.bool, shape=())
    ac = U.switch(stochastic, self.pd.sample(), self.pd.mode())
    # choose the appropriate action: hold the trailing observation slice
    # when option[0] is falsy
    ac = U.switch(option[0], ac, tf.stop_gradient(ob[:, -self.ac_space_dim:]))
    ac = tf.clip_by_value(ac, -1.0, 1.0)
    self.last_action = tf.stop_gradient(ac)

    # Callable wrappers around the graph.
    self._act = U.function([stochastic, ob, option],
                           [ac, self.vpred, last_out, logstd])
    self._get_v = U.function([ob, option], [self.vpred])
    self.get_term = U.function([ob, option], [termination_sample])
    self.get_tpred = U.function([ob, option], [self.tpred])
    self.get_vpred = U.function([ob, option], [self.vpred])
    self._get_op = U.function([ob], [self.op_pi])
    # additional functions that return the action mean and the special values
    # for the policy over options
    # NOTE(review): this returns the local, unclipped `ac_mean`, not the
    # clipped `self.ac_mean` -- confirm that is intended.
    self._act_mean = U.function([ob, option], [ac_mean])
    self._get_op_orig = U.function([ob], [self.op_pi_orig])
def _init(self, ob_space, ac_space, hid_size, num_hid_layers, gaussian_fixed_var=True,
          num_options=2, dc=0, kind='small'):
    """Build the TensorFlow graph for a conv + MLP option policy.

    A conv trunk selected by ``kind`` produces features.  Two tanh MLPs on
    top of them feed (a) the value, entropy-value and termination heads
    and (b) the action distribution and the policy over options.

    Args:
        ob_space: observation space; must be a ``gym.spaces.Box``.
        ac_space: action space handed to ``make_pdtype``.
        hid_size: width of every MLP hidden layer.
        num_hid_layers: number of hidden layers in each MLP.
        gaussian_fixed_var: accepted for interface compatibility; unused
            (the Gaussian head is commented out).
        num_options: number of options.
        dc: stored on ``self``; not otherwise used here.
        kind: ``'small'`` (two conv layers, 256 units) or ``'large'``
            (three conv layers, 512 units).

    Raises:
        NotImplementedError: if ``kind`` is neither ``'small'`` nor
            ``'large'``.
    """
    assert isinstance(ob_space, gym.spaces.Box)
    self.dc = dc
    self.num_options = num_options
    self.pdtype = pdtype = make_pdtype(ac_space)

    sequence_length = None
    ob = U.get_placeholder(name="ob", dtype=tf.float32,
                           shape=[sequence_length] + list(ob_space.shape))
    option = U.get_placeholder(name="option", dtype=tf.int32, shape=[None])

    # Observations are divided by 255 before the conv trunk.
    x = ob / 255.0
    if kind == 'small':  # from A3C paper
        x = tf.nn.relu(U.conv2d(x, 16, "l1", [8, 8], [4, 4], pad="VALID"))
        x = tf.nn.relu(U.conv2d(x, 32, "l2", [4, 4], [2, 2], pad="VALID"))
        x = U.flattenallbut0(x)
        x = tf.nn.relu(U.dense(x, 256, 'lin', U.normc_initializer(1.0)))
    elif kind == 'large':  # Nature DQN
        x = tf.nn.relu(U.conv2d(x, 32, "l1", [8, 8], [4, 4], pad="VALID"))
        x = tf.nn.relu(U.conv2d(x, 64, "l2", [4, 4], [2, 2], pad="VALID"))
        x = tf.nn.relu(U.conv2d(x, 64, "l3", [3, 3], [1, 1], pad="VALID"))
        x = U.flattenallbut0(x)
        x = tf.nn.relu(U.dense(x, 512, 'lin', U.normc_initializer(1.0)))
    else:
        raise NotImplementedError

    # Network to compute value function and termination probabilities
    with tf.variable_scope("obfilter"):
        # Created but not applied in this method: the conv features are
        # used directly as `obz`.
        self.ob_rms = RunningMeanStd(shape=ob_space.shape)
    obz = x

    last_out = obz
    for i in range(num_hid_layers):
        last_out = tf.nn.tanh(
            U.dense(last_out, hid_size, "vffc%i" % (i + 1),
                    weight_init=U.normc_initializer(1.0)))
    self.vpred = dense3D2(last_out, 1, "vffinal", option,
                          num_options=num_options,
                          weight_init=U.normc_initializer(1.0))[:, 0]
    self.vpred_ent = dense3D2(last_out, 1, "vffinal_ent", option,
                              num_options=num_options,
                              weight_init=U.normc_initializer(1.0))[:, 0]
    self.tpred = tf.nn.sigmoid(
        dense3D2(tf.stop_gradient(last_out), 1, "termhead", option,
                 num_options=num_options,
                 weight_init=U.normc_initializer(1.0)))[:, 0]
    # Bernoulli draw: terminate when tpred exceeds a uniform sample in [0, 1).
    termination_sample = tf.greater(
        self.tpred, tf.random_uniform(shape=tf.shape(self.tpred), maxval=1.))

    # Network to compute policy over options and intra_option policies
    last_out = obz
    for i in range(num_hid_layers):
        last_out = tf.nn.tanh(
            U.dense(last_out, hid_size, "polfc%i" % (i + 1),
                    weight_init=U.normc_initializer(1.0)))

    # if gaussian_fixed_var and isinstance(ac_space, gym.spaces.Discrete):
    #     mean = dense3D2(last_out, pdtype.param_shape()[0]//2, "polfinal", option, num_options=num_options, weight_init=U.normc_initializer(0.01))
    #     logstd = tf.get_variable(name="logstd", shape=[num_options, 1, pdtype.param_shape()[0]//2], initializer=tf.zeros_initializer())
    #     pdparam = U.concatenate([mean, mean * 0.0 + logstd[option[0]]], axis=1)
    # else:
    pdparam = U.dense(last_out, pdtype.param_shape()[0], "polfinal",
                      U.normc_initializer(0.01))

    # The layer name used to be built from the leaked loop variable `i`,
    # which is unbound when num_hid_layers == 0.  `num_hid_layers` equals
    # `i + 1` after a non-empty loop, so existing variable names (and
    # checkpoints) are unchanged.
    self.op_pi = tf.nn.softmax(
        U.dense(tf.stop_gradient(last_out), num_options,
                "OPfc%i" % num_hid_layers,
                weight_init=U.normc_initializer(1.0)))

    self.pd = pdtype.pdfromflat(pdparam)

    self.state_in = []
    self.state_out = []

    stochastic = tf.placeholder(dtype=tf.bool, shape=())
    ac = U.switch(stochastic, self.pd.sample(), self.pd.mode())

    # Callable wrappers around the graph.
    self._act = U.function([stochastic, ob, option],
                           [ac, self.vpred, self.vpred_ent, last_out])
    # Reads `self.pd.logits`, so the distribution must expose that attribute.
    self._get_logits = U.function([stochastic, ob, option], [self.pd.logits])

    self._get_v = U.function([ob, option], [self.vpred])
    self._get_v_ent = U.function([ob, option], [self.vpred_ent])  # Entropy value estimate
    self.get_term = U.function([ob, option], [termination_sample])
    self.get_tpred = U.function([ob, option], [self.tpred])
    self.get_vpred = U.function([ob, option], [self.vpred])
    self.get_vpred_ent = U.function([ob, option], [self.vpred_ent])  # Entropy value estimate
    self._get_op = U.function([ob], [self.op_pi])
def _init(self, ob_space, ac_space, hid_size, num_hid_layers, gaussian_fixed_var=True):
    """Build the TensorFlow graph for an MLP policy with a Beta action distribution.

    Two tanh MLPs on the normalised observation produce the value
    prediction (``vpred``) and the two concentration parameters of a
    ``tfp.distributions.Beta`` (``pd``).

    Args:
        ob_space: observation space; must be a ``gym.spaces.Box``.
        ac_space: action space; must be a ``gym.spaces.Box`` for the
            policy head to be built.
        hid_size: width of every hidden layer.
        num_hid_layers: number of hidden layers in each network.
        gaussian_fixed_var: must be true; any other configuration is not
            implemented.

    Raises:
        NotImplementedError: if ``gaussian_fixed_var`` is false or
            ``ac_space`` is not a Box.
    """
    assert isinstance(ob_space, gym.spaces.Box)

    self.pdtype = pdtype = make_pdtype(ac_space)
    sequence_length = None

    ob = U.get_placeholder(name="ob", dtype=tf.float32,
                           shape=[sequence_length] + list(ob_space.shape))

    # Running-statistics normalisation, clipped to [-5, 5].
    with tf.variable_scope("obfilter"):
        self.ob_rms = RunningMeanStd(shape=ob_space.shape)
    obz = tf.clip_by_value((ob - self.ob_rms.mean) / self.ob_rms.std, -5.0, 5.0)

    def tanh_tower(prefix):
        """Stack ``num_hid_layers`` tanh layers named ``<prefix><k>`` on ``obz``."""
        hidden = obz
        for depth in range(1, num_hid_layers + 1):
            hidden = tf.nn.tanh(
                U.dense(hidden, hid_size, "%s%i" % (prefix, depth),
                        weight_init=U.normc_initializer(1.0)))
        return hidden

    # Value network.
    value_features = tanh_tower("vffc")
    self.vpred = U.dense(value_features, 1, "vffinal",
                         weight_init=U.normc_initializer(1.0))[:, 0]

    # Policy network.
    last_out = tanh_tower("polfc")

    if not (gaussian_fixed_var and isinstance(ac_space, gym.spaces.Box)):
        raise NotImplementedError

    def concentration(layer_name):
        """softplus(dense) + 1, so each concentration is strictly above 1."""
        raw = U.dense(last_out, ac_space.high.size, layer_name,
                      weight_init=U.normc_initializer(0.001))
        return tf.nn.softplus(raw) + 1.0

    alpha = concentration('polfc_alpha')
    beta = concentration('polfc_beta')
    self.pd = tfp.distributions.Beta(alpha, beta)

    self.state_in = []
    self.state_out = []

    # compute sampled action
    sampled_action = self.pd.sample()
    # `stochastic` selects between the sample and the distribution's mode.
    stochastic = tf.placeholder(dtype=tf.bool, shape=())
    ac = U.switch(stochastic, sampled_action, self.pd.mode())
    self._act = U.function([stochastic, ob], [ac, self.vpred])
def _init(self, ob_space, ac_space, hid_size, num_hid_layers, gaussian_fixed_var=True,
          bound_by_sigmoid=False, sigmoid_coef=1., activation='tanh', normalize_obs=True,
          actions='gaussian', avg_norm_symmetry=False, symmetric_interpretation=False,
          stdclip=5.0, gaussian_bias=False, gaussian_from_binary=False,
          parallel_value=False, pv_layers=2, pv_hid_size=512, three=False):
    """Build value and policy networks for one of several action distributions.

    One network pair is built by default, or three pairs (variable prefixes
    "", "part_1", "part_2") when ``three`` is set. With ``three``, each batch
    row picks one pair from the step counter ``st`` and a few observation
    entries.

    Args:
        ob_space: observation space; must be a ``gym.spaces.Box``.
        ac_space: action space; its ``low``/``high`` arrays size the
            distribution.
        hid_size: units per hidden layer of the main value/policy networks.
        num_hid_layers: hidden layers in the main value/policy networks.
        gaussian_fixed_var: for ``actions='gaussian'``, use a state-independent
            ``logstd`` variable instead of a second dense output.
        bound_by_sigmoid: squash the fixed-variance Gaussian mean through a
            sigmoid; not implemented for 'bernoulli'/'binary'.
        sigmoid_coef: scale applied to the mean before that sigmoid.
        activation: 'tanh', 'relu', or a callable used as the policy hidden
            activation.
        normalize_obs: normalize observations with a running mean/std filter.
        actions: one of 'binary', 'beta', 'bernoulli', 'gaussian', 'cat_3',
            'cat_5'.
        avg_norm_symmetry: average the filter statistics with their
            ``ORIG_SYMMETRIC_IDS``-permuted copy.
        symmetric_interpretation: not referenced in this method.
        stdclip: clipping bound for normalized observations.
        gaussian_bias: add 0.5 to the Gaussian mean.
        gaussian_from_binary: not referenced in this method.
        parallel_value: add a second value network whose output is summed
            into the value prediction.
        pv_layers: hidden layers of the parallel value network.
        pv_hid_size: hidden units of the parallel value network.
        three: build three network pairs and select among them per sample.

    Raises:
        NotImplementedError: ``bound_by_sigmoid`` with 'bernoulli'/'binary'.
        AssertionError: unknown ``actions`` value or non-Box ``ob_space``.
    """
    assert isinstance(ob_space, gym.spaces.Box)

    if actions == 'binary':
        self.pdtype = pdtype = MultiCategoricalPdType(
            low=np.zeros_like(ac_space.low, dtype=np.int32),
            high=np.ones_like(ac_space.high, dtype=np.int32))
    elif actions == 'beta':
        self.pdtype = pdtype = BetaPdType(
            low=np.zeros_like(ac_space.low, dtype=np.int32),
            high=np.ones_like(ac_space.high, dtype=np.int32))
    elif actions == 'bernoulli':
        self.pdtype = pdtype = BernoulliPdType(ac_space.low.size)
    elif actions == 'gaussian':
        self.pdtype = pdtype = make_pdtype(ac_space)
    elif actions == 'cat_3':
        self.pdtype = pdtype = MultiCategoricalPdType(
            low=np.zeros_like(ac_space.low, dtype=np.int32),
            high=np.ones_like(ac_space.high, dtype=np.int32) * 2)
    elif actions == 'cat_5':
        self.pdtype = pdtype = MultiCategoricalPdType(
            low=np.zeros_like(ac_space.low, dtype=np.int32),
            high=np.ones_like(ac_space.high, dtype=np.int32) * 4)
    else:
        assert False, "unknown actions type: %r" % (actions,)

    # Resolve the policy activation once; it does not change between parts.
    # Anything other than the two known names is used as given (a callable).
    if activation == 'tanh':
        activation_fn = tf.nn.tanh
    elif activation == 'relu':
        activation_fn = tf.nn.relu
    else:
        activation_fn = activation

    sequence_length = None

    self.ob = U.get_placeholder(name="ob", dtype=tf.float32,
                                shape=[sequence_length] + list(ob_space.shape))
    self.st = U.get_placeholder(name="st", dtype=tf.int32, shape=[None])

    if normalize_obs:
        with tf.variable_scope("obfilter"):
            self.ob_rms = RunningMeanStd(shape=ob_space.shape)
        if avg_norm_symmetry:
            # Warning works only for normal observations (41 numbers)
            ob_mean = (tf.gather(self.ob_rms.mean, ORIG_SYMMETRIC_IDS) + self.ob_rms.mean) / 2
            ob_std = (tf.gather(self.ob_rms.std, ORIG_SYMMETRIC_IDS) + self.ob_rms.std) / 2  # Pretty crude
        else:
            ob_mean = self.ob_rms.mean
            ob_std = self.ob_rms.std

        obz = tf.clip_by_value((self.ob - ob_mean) / ob_std, -stdclip, stdclip)
    else:
        obz = self.ob
        # `_act` always reports (ob_mean, ob_std). Without a filter, report
        # identity statistics so building `_act` does not hit a NameError.
        ob_mean = tf.zeros(ob_space.shape, dtype=tf.float32)
        ob_std = tf.ones(ob_space.shape, dtype=tf.float32)

    vpreds = []
    pparams = []

    for part in range(1 if not three else 3):
        part_prefix = "" if part == 0 else "part_" + str(part)

        # Predicted value
        last_out = obz
        for i in range(num_hid_layers):
            last_out = tf.nn.tanh(
                U.dense(last_out, hid_size, part_prefix + "vffc%i" % (i + 1),
                        weight_init=U.normc_initializer(1.0)))
        vpreds.append(
            U.dense(last_out, 1, part_prefix + "vffinal",
                    weight_init=U.normc_initializer(1.0)))
        vpreds[-1] = vpreds[-1][:, 0]

        if parallel_value:
            # Second, independently sized value network added to the first.
            last_out_2 = obz
            for i in range(pv_layers):
                last_out_2 = tf.nn.tanh(
                    U.dense(last_out_2, pv_hid_size, part_prefix + "pv_vffc%i" % (i + 1),
                            weight_init=U.normc_initializer(1.0)))
            last_out_2 = U.dense(last_out_2, 1, part_prefix + "pv_vffinal",
                                 weight_init=U.normc_initializer(1.0))
            vpreds[-1] += last_out_2[:, 0]

        # Policy hidden layers
        last_out = obz
        for i in range(num_hid_layers):
            dense = U.dense(last_out, hid_size, part_prefix + "polfc%i" % (i + 1),
                            weight_init=U.normc_initializer(1.0))
            last_out = activation_fn(dense)

        if actions == 'gaussian':
            if gaussian_fixed_var:
                mean = U.dense(last_out, pdtype.param_shape()[0] // 2,
                               part_prefix + "polfinal", U.normc_initializer(0.01))
                if bound_by_sigmoid:
                    mean = tf.nn.sigmoid(mean * sigmoid_coef)
                logstd = tf.get_variable(
                    name=part_prefix + "logstd",
                    shape=[1, pdtype.param_shape()[0] // 2],
                    initializer=tf.zeros_initializer())
                # Broadcast the shared logstd row to the batch dimension.
                logstd = mean * 0.0 + logstd
            else:
                mean = U.dense(last_out, pdtype.param_shape()[0] // 2,
                               part_prefix + "polfinal", U.normc_initializer(0.01))
                logstd = U.dense(last_out, pdtype.param_shape()[0] // 2,
                                 part_prefix + "polfinal_2", U.normc_initializer(0.01))
            if gaussian_bias:
                mean = mean + 0.5
            pdparam = U.concatenate([mean, logstd], axis=1)
        elif actions == 'beta':
            pdparam = U.dense(last_out, pdtype.param_shape()[0],
                              part_prefix + "beta_lastlayer", U.normc_initializer(0.01))
            pdparam = tf.nn.softplus(pdparam)
        elif actions in ['bernoulli', 'binary']:
            if bound_by_sigmoid:
                raise NotImplementedError(
                    "bound by sigmoid not implemented here")
            pdparam = U.dense(last_out, pdtype.param_shape()[0],
                              part_prefix + "polfinal", U.normc_initializer(0.01))
        elif actions in ['cat_3']:
            pdparam = U.dense(last_out, pdtype.param_shape()[0],
                              part_prefix + "cat3_lastlayer", U.normc_initializer(0.01))
        elif actions in ['cat_5']:
            pdparam = U.dense(last_out, pdtype.param_shape()[0],
                              part_prefix + "cat5_lastlayer", U.normc_initializer(0.01))
        else:
            assert False, "unknown actions type: %r" % (actions,)

        pparams.append(pdparam)

    pparams = tf.stack(pparams)
    vpreds = tf.stack(vpreds)
    pparams = tf.transpose(pparams, perm=(1, 0, 2))  # [batchsize, networks, values]
    vpreds = tf.transpose(vpreds, perm=(1, 0))  # [batchsize, networks]

    self.stochastic = tf.placeholder(name="stochastic", dtype=tf.bool, shape=())

    if three:
        batchsize = tf.shape(pdparam)[0]
        NO_OBSTACLES_ID = 5
        OBST_DIST = [278, 279, 280, 281, 282, 283, 284, 285]  # TODO: Alternative approach
        distances = [self.ob[:, i] for i in OBST_DIST]
        distances = tf.stack(distances, axis=1)
        no_obstacles = tf.cast(tf.equal(self.ob[:, NO_OBSTACLES_ID], 1.0), tf.int32)
        distances = tf.cast(tf.reduce_all(tf.equal(distances, 3), axis=1), tf.int32)
        no_obstacles_ahead = distances * no_obstacles  # 0 if obstacles, 1 if no obstacles
        begin = tf.cast(tf.less(self.st, 75), tf.int32)
        # begin==1 => 0, begin==0 => 1 + no_obstacles_ahead
        take_id = (1 - begin) * (1 + no_obstacles_ahead)

        # Pair each batch row with the index of the network it should use.
        take_id = tf.stack((tf.range(batchsize), take_id), axis=1)
        pdparam = tf.gather_nd(pparams, take_id)

        self.vpred = tf.gather_nd(vpreds, take_id)
    else:
        self.vpred = vpreds[:, 0]
        pdparam = pparams[:, 0]

    self.pd = pdtype.pdfromflat(pdparam)

    if hasattr(self.pd, 'real_mean'):
        real_mean = self.pd.real_mean()
        ac = U.switch(self.stochastic, self.pd.sample(), real_mean)
    else:
        ac = U.switch(self.stochastic, self.pd.sample(), self.pd.mode())

    self._act = U.function([self.stochastic, self.ob, self.st],
                           [ac, self.vpred, ob_mean, ob_std])

    if actions == 'binary':
        self._binary_f = U.function([self.stochastic, self.ob, self.st],
                                    [ac, self.pd.flat, self.vpred])
def _init(self, ob_space, ac_space, hid_size_V, hid_size_actor, num_hid_layers, V_keep_prob,
          mc_samples, layer_norm, activation_critic, activation_actor, dropout_on_V,
          gaussian_fixed_var=True, sample_dropout=False):
    """Build a policy network and a value network with Monte-Carlo dropout copies.

    The value function is built once without dropout (``self.vpred``) and
    ``mc_samples`` more times through ``generate_dropout_layer``, reusing the
    same variables. Those copies yield a mean prediction, a variance and a
    variance-shifted estimate.

    Args:
        ob_space: observation space; must be a ``gym.spaces.Box``.
        ac_space: action space passed to ``make_pdtype``.
        hid_size_V: units per hidden layer of the value network.
        hid_size_actor: units per hidden layer of the policy network.
        num_hid_layers: hidden layers in both networks.
        V_keep_prob: keep probability handed to the dropout layer helpers.
        mc_samples: number of dropout copies of the value network.
        layer_norm: apply layer normalization after each value hidden layer.
        activation_critic: activation callable for the value network.
        activation_actor: activation callable for the policy network.
        dropout_on_V: make ``_act`` return dropout-based value estimates
            instead of ``self.vpred``.
        gaussian_fixed_var: use a state-independent ``logstd`` variable when
            ``ac_space`` is a ``gym.spaces.Box``.
        sample_dropout: with ``dropout_on_V``, make ``_act`` a list with one
            function per dropout copy instead of a single function returning
            their mean.
    """
    assert isinstance(ob_space, gym.spaces.Box)
    self.sample_dropout = sample_dropout
    self.pdtype = pdtype = make_pdtype(ac_space)
    sequence_length = None

    ob = U.get_placeholder(name="ob", dtype=tf.float32,
                           shape=[sequence_length] + list(ob_space.shape))

    with tf.variable_scope("obfilter"):
        self.ob_rms = RunningMeanStd(shape=ob_space.shape)

    obz = tf.clip_by_value((ob - self.ob_rms.mean) / self.ob_rms.std, -5.0, 5.0)
    last_out = obz

    self.mc_samples = mc_samples
    self.V_keep_prob = V_keep_prob

    # Value function
    with tf.variable_scope("value_function"):
        # Every dropout copy starts from the same normalized observation.
        dropout_networks = [last_out] * self.mc_samples

        for i in range(num_hid_layers):
            dense_name = "vffc%i" % (i + 1)
            norm_scope = "vffc_activation%i" % (i + 1)
            if layer_norm:
                last_out = activation_critic(tc.layers.layer_norm(
                    tf.layers.dense(last_out, hid_size_V, name=dense_name,
                                    kernel_initializer=U.normc_initializer(1.0)),
                    center=True, scope=norm_scope, scale=True))
                # Same layer with shared weights, applied to the dropout copies.
                apply_layer = lambda x: activation_critic(tc.layers.layer_norm(
                    tf.layers.dense(x, hid_size_V, name=dense_name, reuse=True),
                    center=True, scope=norm_scope, scale=True, reuse=True))
            else:
                last_out = activation_critic(
                    tf.layers.dense(last_out, hid_size_V, name=dense_name,
                                    kernel_initializer=U.normc_initializer(1.0)))
                apply_layer = lambda x: activation_critic(
                    tf.layers.dense(x, hid_size_V, name=dense_name, reuse=True))
            # The lambda is consumed in this iteration, before the names are rebound.
            dropout_networks = generate_dropout_layer(apply_layer, dropout_networks,
                                                      self.V_keep_prob)

        # Final layer
        self.vpred = tf.layers.dense(last_out, 1, name="vffinal",
                                     kernel_initializer=U.normc_initializer(1.0))[:, 0]
        apply_layer = lambda x: tf.layers.dense(x, 1, activation=None,
                                                name="vffinal", reuse=True)[:, 0]
        dropout_networks = generate_layer(apply_layer, dropout_networks, self.V_keep_prob)

        _, variance = tf.nn.moments(tf.stack(dropout_networks), 0)
        self.vpred_mc_mean = tf.add_n(dropout_networks) / float(len(dropout_networks))
        self.vpred_dropout_networks = dropout_networks
        self.variance = variance
        # Keep a handle on the placeholder; without one, `v_lambda_variance`
        # cannot be fed and therefore cannot be evaluated.
        self.lambda_ph = LAMBDA = tf.placeholder(dtype=tf.float32, shape=())
        self.v_lambda_variance = self.vpred_mc_mean + LAMBDA * tf.sqrt(variance)

    # Policy
    last_out = obz
    with tf.variable_scope("policy"):
        for i in range(num_hid_layers):
            last_out = U.dense(last_out, hid_size_actor, "polfc%i" % (i + 1),
                               weight_init=U.normc_initializer(1.0))
            last_out = activation_actor(last_out)
        if gaussian_fixed_var and isinstance(ac_space, gym.spaces.Box):
            mean = U.dense(last_out, pdtype.param_shape()[0] // 2, "polfinal",
                           U.normc_initializer(0.01))
            logstd = tf.get_variable(name="logstd",
                                     shape=[1, pdtype.param_shape()[0] // 2],
                                     initializer=tf.zeros_initializer())
            # `mean * 0.0 + logstd` broadcasts the shared row over the batch.
            pdparam = U.concatenate([mean, mean * 0.0 + logstd], axis=1)
        else:
            pdparam = U.dense(last_out, pdtype.param_shape()[0], "polfinal",
                              U.normc_initializer(0.01))

    self.pd = pdtype.pdfromflat(pdparam)

    self.state_in = []
    self.state_out = []

    stochastic = tf.placeholder(dtype=tf.bool, shape=())
    ac = U.switch(stochastic, self.pd.sample(), self.pd.mode())

    # Pick which value estimate `_act` reports next to the action.
    if dropout_on_V:
        if self.sample_dropout:
            self._act = [U.function([stochastic, ob], [ac, x]) for x in dropout_networks]
        else:
            self._act = U.function([stochastic, ob], [ac, self.vpred_mc_mean])
    else:
        self._act = U.function([stochastic, ob], [ac, self.vpred])
def _init(self, ob_space, ac_space, hid_size, num_hid_layers, gaussian_fixed_var=True,
          num_options=2, dc=0):
    """Build the networks of a two-option policy with a zero-order-hold option.

    The last ``ac_space.shape[0]`` entries of each observation are treated as
    the previous control action u[k-1]. Option 0 networks read the full
    normalized observation. Option 1 networks and the intra-option policy
    read the observation with those trailing entries removed. When
    ``option[0]`` is 0 the emitted action is u[k-1]; otherwise it comes from
    the intra-option policy.

    Args:
        ob_space: observation space; must be a ``gym.spaces.Box``.
        ac_space: action space; ``ac_space.shape[0]`` is the action dimension.
        hid_size: units per hidden layer.
        num_hid_layers: hidden tanh layers in each network.
        gaussian_fixed_var: use a per-option, state-independent ``logstd``
            variable when ``ac_space`` is a ``gym.spaces.Box``.
        num_options: number of options passed to ``dense3D2`` and used to
            size ``logstd``.
        dc: stored unchanged as ``self.dc``.
    """
    assert isinstance(ob_space, gym.spaces.Box)

    # Define the dimensions
    self.ac_space_dim = ac_space.shape[0]
    self.ob_space_dim = ob_space.shape[0]
    self.dc = dc
    self.last_action = tf.zeros(ac_space.shape, dtype=tf.float32)
    self.last_action_init = tf.zeros(ac_space.shape, dtype=tf.float32)
    self.num_options = num_options
    self.pdtype = pdtype = make_pdtype(ac_space)
    sequence_length = None

    ob = U.get_placeholder(name="ob", dtype=tf.float32,
                           shape=[sequence_length] + list(ob_space.shape))
    option = U.get_placeholder(name="option", dtype=tf.int32, shape=[None])

    # Create a filter for the pure shape, meaning excluding u[k-1].
    obs_shape_pure = ((self.ob_space_dim - self.ac_space_dim), )

    with tf.variable_scope("obfilter"):
        self.ob_rms = RunningMeanStd(shape=ob_space.shape)
    with tf.variable_scope("obfilter_pure"):
        self.ob_rms_only = RunningMeanStd(shape=obs_shape_pure)

    obz = tf.clip_by_value((ob - self.ob_rms.mean) / self.ob_rms.std, -5.0, 5.0)
    obz_pure = tf.clip_by_value(
        (ob[:, :-self.ac_space_dim] - self.ob_rms_only.mean) / self.ob_rms_only.std,
        -5.0, 5.0)

    # Implementation of the Q-function:
    last_out0 = obz  # for option 0
    last_out1 = obz_pure  # for option 1
    for i in range(num_hid_layers):
        last_out0 = tf.nn.tanh(
            U.dense(last_out0, hid_size, "vffc0%i" % (i + 1),
                    weight_init=U.normc_initializer(1.0)))
        last_out1 = tf.nn.tanh(
            U.dense(last_out1, hid_size, "vffc1%i" % (i + 1),
                    weight_init=U.normc_initializer(1.0)))
    last_out0 = U.dense(last_out0, 1, "vfff0", weight_init=U.normc_initializer(1.0))
    last_out1 = U.dense(last_out1, 1, "vfff1", weight_init=U.normc_initializer(1.0))

    # Value of (state, option) -> denoted as Q-function in the report.
    self.vpred = U.switch(option[0], last_out1, last_out0)[:, 0]

    # Implementation of the policy over options:
    last_out0 = obz  # for option 0
    last_out1 = obz_pure  # for option 1
    for i in range(num_hid_layers):
        last_out0 = tf.nn.tanh(
            U.dense(last_out0, hid_size, "oppi0%i" % (i + 1),
                    weight_init=U.normc_initializer(1.0)))
        last_out1 = tf.nn.tanh(
            U.dense(last_out1, hid_size, "oppi1%i" % (i + 1),
                    weight_init=U.normc_initializer(1.0)))
    last_out0 = U.dense(last_out0, 1, "oppif0", weight_init=U.normc_initializer(1.0))
    last_out1 = U.dense(last_out1, 1, "oppif1", weight_init=U.normc_initializer(1.0))
    last_out = tf.concat([last_out0, last_out1], 1)

    # This is the output of the policy over options:
    self.op_pi = tf.nn.softmax(last_out)

    # Termination head; gradients do not flow back into the option logits.
    self.tpred = tf.nn.sigmoid(
        dense3D2(tf.stop_gradient(last_out), 1, "termhead", option,
                 num_options=num_options,
                 weight_init=U.normc_initializer(1.0)))[:, 0]
    # Always terminate
    termination_sample = tf.constant([True])

    # Calculate the control action -> implementation of intra-option policy.
    last_out = obz_pure
    for i in range(num_hid_layers):
        last_out = tf.nn.tanh(
            U.dense(last_out, hid_size, "polfc%i" % (i + 1),
                    weight_init=U.normc_initializer(1.0)))

    if gaussian_fixed_var and isinstance(ac_space, gym.spaces.Box):
        mean = dense3D2(last_out, pdtype.param_shape()[0] // 2, "polfinal", option,
                        num_options=num_options,
                        weight_init=U.normc_initializer(0.01), bias=False)
        mean = tf.nn.tanh(mean)
        logstd = tf.get_variable(
            name="logstd",
            shape=[num_options, 1, pdtype.param_shape()[0] // 2],
            initializer=tf.zeros_initializer())
        pdparam = U.concatenate([mean, mean * 0.0 + logstd[option[0]]], axis=1)
    else:
        pdparam = U.dense(last_out, pdtype.param_shape()[0], "polfinal",
                          U.normc_initializer(0.01))
        # `_act` always reports `logstd`; bind it here so this branch no
        # longer raises NameError. NOTE(review): assumes the flat parameters
        # are laid out [mean | logstd], as the branch above builds them.
        # Confirm for non-Gaussian pdtypes.
        logstd = pdparam[:, pdtype.param_shape()[0] // 2:]

    self.pd = pdtype.pdfromflat(pdparam)

    self.state_in = []
    self.state_out = []

    # If stochastic is true, we sample around the mean; this corresponds to
    # the exploration at the action level.
    stochastic = tf.placeholder(dtype=tf.bool, shape=())
    ac = U.switch(stochastic, self.pd.sample(), self.pd.mode())

    # Determine the control action to be applied. In case of ZOH == opt 0
    # just use u[k-1].
    ac = U.switch(option[0], ac, tf.stop_gradient(ob[:, -self.ac_space_dim:]))
    ac = tf.clip_by_value(ac, -1.0, 1.0)

    self.last_action = tf.stop_gradient(ac)
    self._act = U.function([stochastic, ob, option],
                           [ac, self.vpred, last_out, logstd])

    self._get_v = U.function([ob, option], [self.vpred])
    self.get_term = U.function([ob, option], [termination_sample])
    self.get_tpred = U.function([ob, option], [self.tpred])
    self.get_vpred = U.function([ob, option], [self.vpred])
    self._get_op = U.function([ob], [self.op_pi])
def __init__(self, ob_dim, ac_dim, hid_size=128, num_hid_layers=2):
    """Build a Gaussian MLP policy with its sampling and loss expressions.

    The expressions serve two purposes: sampling actions (names containing
    "sampled") and computing the policy-update losses (names containing
    "old" refer to data from the policy that generated the batch).

    Args:
        ob_dim: observation size; the input placeholder is ``2 * ob_dim`` wide.
        ac_dim: action dimension.
        hid_size: units per hidden SELU layer.
        num_hid_layers: number of hidden layers.
    """
    # Inputs: observations, previously taken actions, the [mean | std]
    # distribution those actions came from, and advantage estimates.
    ob_no = tf.placeholder(tf.float32, shape=[None, ob_dim * 2], name="ob")
    oldac_na = tf.placeholder(tf.float32, shape=[None, ac_dim], name="ac")
    oldac_dist = tf.placeholder(tf.float32, shape=[None, ac_dim * 2], name="oldac_dist")
    adv_n = tf.placeholder(tf.float32, shape=[None], name="adv")

    weight_losses = {}

    hidden = ob_no
    for layer_no in range(1, num_hid_layers + 1):
        pre_activation = U.dense(hidden, hid_size, "polfc%i" % layer_no,
                                 weight_init=U.normc_initializer(1.0))
        hidden = tf.nn.selu(pre_activation)

    # Mean control output; this layer registers itself in the weight-loss dict.
    mean_na = dense(hidden, ac_dim, "mean", weight_init=U.normc_initializer(0.1),
                    bias_init=0.0, weight_loss_dict=weight_losses)
    self.wd_dict = weight_losses

    # State-independent log standard deviation, tiled over the batch.
    self.logstd_1a = tf.get_variable("logstd", [ac_dim], tf.float32,
                                     tf.zeros_initializer())
    logstd_row = tf.expand_dims(self.logstd_1a, 0)
    std_row = tf.exp(logstd_row)
    std_na = tf.tile(std_row, [tf.shape(mean_na)[0], 1])

    # Flat distribution parameters: first ac_dim columns mean, last ac_dim std.
    ac_dist = tf.concat([
        tf.reshape(mean_na, [-1, ac_dim]),
        tf.reshape(std_na, [-1, ac_dim]),
    ], 1)

    # This is the sampled action we'll perform.
    noise = tf.random_normal(tf.shape(ac_dist[:, ac_dim:]))
    sampled_ac_na = noise * ac_dist[:, ac_dim:] + ac_dist[:, :ac_dim]

    def gaussian_logprob(ac_na):
        """Log-density of `ac_na` under the current diagonal Gaussian."""
        neg_log_std_sum = -U.sum(tf.log(ac_dist[:, ac_dim:]), axis=1)
        normalizer = 0.5 * tf.log(2.0 * np.pi) * ac_dim
        head = neg_log_std_sum - normalizer
        scaled_sq_err = U.sum(
            tf.square(ac_dist[:, :ac_dim] - ac_na) / (tf.square(ac_dist[:, ac_dim:])),
            axis=1)
        return head - 0.5 * scaled_sq_err

    # Logprob of the freshly sampled action.
    logprobsampled_n = gaussian_logprob(sampled_ac_na)
    # Logprob of previous actions under the CURRENT policy.
    logprob_n = gaussian_logprob(oldac_na)

    # KL between the old action distribution and the current one.
    kl = U.mean(kl_div(oldac_dist, ac_dist, ac_dim))

    # Loss function that we'll differentiate to get the policy gradient.
    surr = -U.mean(adv_n * logprob_n)
    # Sampled loss of the policy.
    surr_sampled = -U.mean(logprob_n)

    # Generate a new action, its distribution parameters and its logprob.
    self._act = U.function([ob_no], [sampled_ac_na, ac_dist, logprobsampled_n])
    self.compute_kl = U.function([ob_no, oldac_dist], kl)
    # Input and output variables needed for computing the loss.
    self.update_info = ((ob_no, oldac_na, adv_n), surr, surr_sampled)

    # Initialize uninitialized TF variables.
    U.initialize()
def _init(self, ob_space, ac_space, hid_size, num_hid_layers, gaussian_fixed_var=True):
    """Build a tanh-MLP value head and a binned (discretized) policy head.

    The policy head emits one sigmoid per (action dimension, bin). These are
    combined through the constant mask from ``construct_mask(bins)`` into the
    flat parameters handed to the action distribution.

    Args:
        ob_space: observation space; must be a ``gym.spaces.Box``.
        ac_space: action space exposing ``low``/``high`` arrays. The bin count
            is ``high[0] - low[0] + 1`` (NOTE(review): presumably every
            dimension shares this range -- confirm).
        hid_size: units per hidden layer of both heads.
        num_hid_layers: hidden tanh layers in each head.
        gaussian_fixed_var: must not be truthy together with a Box
            ``ac_space``; see Raises.

    Raises:
        NotImplementedError: if ``gaussian_fixed_var`` is truthy and
            ``ac_space`` is a ``gym.spaces.Box``.
    """
    assert isinstance(ob_space, gym.spaces.Box)

    bins = ac_space.high[0] - ac_space.low[0] + 1
    print('making policy bins size {}'.format(bins))
    act_dim = len(ac_space.high)

    self.pdtype = pdtype = make_pdtype(ac_space)
    sequence_length = None

    ob = U.get_placeholder(name="ob", dtype=tf.float32,
                           shape=[sequence_length] + list(ob_space.shape))

    with tf.variable_scope("obfilter"):
        self.ob_rms = RunningMeanStd(shape=ob_space.shape)

    obz = tf.clip_by_value((ob - self.ob_rms.mean) / self.ob_rms.std, -5.0, 5.0)

    # Value head
    last_out = obz
    for i in range(num_hid_layers):
        last_out = tf.nn.tanh(
            U.dense(last_out, hid_size, "vffc%i" % (i + 1),
                    weight_init=U.normc_initializer(1.0)))
    self.vpred = U.dense(last_out, 1, "vffinal",
                         weight_init=U.normc_initializer(1.0))[:, 0]

    # Policy head
    last_out = obz
    for i in range(num_hid_layers):
        last_out = tf.nn.tanh(
            U.dense(last_out, hid_size, "polfc%i" % (i + 1),
                    weight_init=U.normc_initializer(1.0)))

    if gaussian_fixed_var and isinstance(ac_space, gym.spaces.Box):
        # The Gaussian head is not supported by this policy.
        raise NotImplementedError

    m = U.dense(last_out, pdtype.param_shape()[0], "polfinal",
                U.normc_initializer(0.01))
    # [batchsize, num-actions*bins], initialized to be about uniform
    norm_softm = tf.nn.sigmoid(m)
    # [batchsize, num-actions, bins]
    norm_softm = tf.reshape(norm_softm, [-1, act_dim, bins])
    # Repeat along a new trailing axis -> [batchsize, num-actions, bins, bins]
    norm_softm_tiled = tf.tile(tf.expand_dims(norm_softm, axis=-1), [1, 1, 1, bins])

    # Construct the mask
    am_numpy = construct_mask(bins)
    am_tf = tf.constant(am_numpy, dtype=tf.float32)

    # Construct pdparam: log(p) where the mask is 1 and log(1 - p) where it
    # is 0, summed over the trailing axis. 1e-8 keeps the logs finite.
    pdparam = tf.reduce_sum(
        tf.math.log(norm_softm_tiled + 1e-8) * am_tf
        + tf.math.log(1 - norm_softm_tiled + 1e-8) * (1 - am_tf),
        axis=-1)
    pdparam = tf.reshape(pdparam, [-1, act_dim * bins])

    self.pd = pdtype.pdfromflat(pdparam)

    self.state_in = []
    self.state_out = []

    stochastic = tf.placeholder(dtype=tf.bool, shape=())
    ac = U.switch(stochastic, self.pd.sample(), self.pd.mode())
    self._act = U.function([stochastic, ob], [ac, self.vpred])