def _create_logit_value(self, action_layer, value_layer, gaussian_fixed_var=False):
    """Attach the policy (actor) and value (critic) output heads.

    Sets ``self.pd``, ``self.ac`` and ``self.vpred``.

    Args:
        action_layer: tensor fed to the ``"polfinal"`` layer.
        value_layer: tensor fed to the ``"vffinal"`` layer.
        gaussian_fixed_var: when true and ``self.ac_space`` is a
            ``gym.spaces.Box``, only the mean is computed from
            ``action_layer``; the log-std is a standalone ``"logstd"``
            variable of shape ``[1, n // 2]``.
    """
    # actor
    flat_size = self.pdtype.param_shape()[0]
    free_logstd = gaussian_fixed_var and isinstance(self.ac_space, gym.spaces.Box)
    if free_logstd:
        half = flat_size // 2
        mean = U.dense(action_layer, half, "polfinal", U.normc_initializer(0.01))
        logstd = tf.get_variable(
            name="logstd", shape=[1, half], initializer=tf.zeros_initializer())
        # ``mean * 0.0 + logstd`` broadcasts the single logstd row to the
        # batch dimension of ``mean`` before concatenation.
        pdparam = U.concatenate([mean, mean * 0.0 + logstd], axis=1)
    else:
        pdparam = U.dense(action_layer, flat_size, "polfinal", U.normc_initializer(0.01))

    self.pd = self.pdtype.pdfromflat(pdparam)
    # ``self.stochastic`` picks between a sample and the distribution mode.
    self.ac = U.switch(self.stochastic, self.pd.sample(), self.pd.mode())

    # critic
    value_out = U.dense(value_layer, 1, "vffinal", weight_init=U.normc_initializer(1.0))
    self.vpred = value_out[:, 0]
def _init(self, ob_space, ac_space, hid_size, num_hid_layers, rnn_hid_units, gaussian_fixed_var=True):
    """Build value and policy networks that each start with ``self.rnn``.

    Two independent trunks are built under the variable scopes ``"vf"`` and
    ``"pf"``: ``self.rnn(ob, ob_space.shape[0], rnn_hid_units)`` followed by
    ``num_hid_layers`` ``U.dense`` layers of width ``hid_size`` (no
    activation is applied between them here).  Sets ``self.vpred``,
    ``self.pd``, ``self.state_in``, ``self.state_out`` and ``self._act``.

    Only the fixed-variance Gaussian head is supported: the function asserts
    that ``gaussian_fixed_var`` is true and ``ac_space`` is a
    ``gym.spaces.Box``.
    """
    self.pdtype = pdtype = make_pdtype(ac_space)
    ob = U.get_placeholder(name="ob", dtype=tf.float32,
                           shape=[None] + list(ob_space.shape))

    def recurrent_trunk(layer_name_fmt):
        # Apply the rnn to reduce history, then the plain dense stack.
        net = self.rnn(ob, ob_space.shape[0], rnn_hid_units)
        for idx in range(num_hid_layers):
            net = U.dense(net, hid_size, layer_name_fmt % idx,
                          weight_init=U.normc_initializer(1.0))
        return net

    with tf.variable_scope("vf"):
        value_features = recurrent_trunk("vf_dense%i")
        self.vpred = U.dense(value_features, 1, "vffinal",
                             weight_init=U.normc_initializer(1.0))[:, 0]

    with tf.variable_scope("pf"):
        policy_features = recurrent_trunk("pf_dense%i")
        assert gaussian_fixed_var and isinstance(ac_space, gym.spaces.Box)
        half = pdtype.param_shape()[0] // 2
        mean = U.dense(policy_features, half, "polfinal", U.normc_initializer(0.01))
        logstd = tf.get_variable(name="logstd", shape=[1, half],
                                 initializer=tf.zeros_initializer())
        # Broadcast the single logstd row over the batch.
        pdparam = U.concatenate([mean, mean * 0.0 + logstd], axis=1)

    self.pd = pdtype.pdfromflat(pdparam)

    self.state_in = []
    self.state_out = []

    stochastic = tf.placeholder(dtype=tf.bool, shape=())
    action = U.switch(stochastic, self.pd.sample(), self.pd.mode())
    self._act = U.function([stochastic, ob], [action, self.vpred])
def _init(self, ob_space, ac_space, hid_size, num_hid_layers, gaussian_fixed_var=True):
    """Build the graph for an MLP policy with a separate value network.

    Creates the ``"ob"`` placeholder, normalises observations with
    ``self.ob_rms`` (clipped to [-5, 5]) and stacks ``num_hid_layers`` tanh
    layers of width ``hid_size`` twice: once for the value head
    (``self.vpred``) and once for the policy head (``self.pd``).  The
    compiled ``(stochastic, ob) -> (action, vpred)`` function is stored in
    ``self._act``.

    Args:
        ob_space: observation space; must be a ``gym.spaces.Box``.
        ac_space: action space handed to ``make_pdtype``.
        hid_size: width of every hidden layer.
        num_hid_layers: number of hidden layers per network.
        gaussian_fixed_var: if true and ``ac_space`` is a Box, the log-std
            is a free ``"logstd"`` variable rather than a network output.
    """
    assert isinstance(ob_space, gym.spaces.Box)

    self.pdtype = pdtype = make_pdtype(ac_space)
    ob = U.get_placeholder(name="ob", dtype=tf.float32,
                           shape=[None] + list(ob_space.shape))

    with tf.variable_scope("obfilter"):
        self.ob_rms = RunningMeanStd(shape=ob_space.shape)
    normalized_ob = tf.clip_by_value(
        (ob - self.ob_rms.mean) / self.ob_rms.std, -5.0, 5.0)

    def tanh_stack(layer_name_fmt):
        # Hidden layers are numbered from 1 in their variable names.
        net = normalized_ob
        for layer_no in range(1, num_hid_layers + 1):
            net = tf.nn.tanh(U.dense(net, hid_size, layer_name_fmt % layer_no,
                                     weight_init=U.normc_initializer(1.0)))
        return net

    # Value network.
    value_features = tanh_stack("vffc%i")
    self.vpred = U.dense(value_features, 1, "vffinal",
                         weight_init=U.normc_initializer(1.0))[:, 0]

    # Policy network.
    policy_features = tanh_stack("polfc%i")
    flat_size = pdtype.param_shape()[0]
    if gaussian_fixed_var and isinstance(ac_space, gym.spaces.Box):
        mean = U.dense(policy_features, flat_size // 2, "polfinal",
                       U.normc_initializer(0.01))
        logstd = tf.get_variable(name="logstd", shape=[1, flat_size // 2],
                                 initializer=tf.zeros_initializer())
        # Broadcast the single logstd row over the batch.
        pdparam = U.concatenate([mean, mean * 0.0 + logstd], axis=1)
    else:
        pdparam = U.dense(policy_features, flat_size, "polfinal",
                          U.normc_initializer(0.01))

    self.pd = pdtype.pdfromflat(pdparam)

    self.state_in = []
    self.state_out = []

    stochastic = tf.placeholder(dtype=tf.bool, shape=())
    action = U.switch(stochastic, self.pd.sample(), self.pd.mode())
    self._act = U.function([stochastic, ob], [action, self.vpred])
def _init(self, ob_space, ac_space, hid_size, num_hid_layers, gaussian_fixed_var=True):
    """Construct the value and policy MLPs over filtered observations.

    Observations from the ``"ob"`` placeholder are standardised with
    ``self.ob_rms`` and clipped to [-5, 5].  A tanh MLP (``"vffc*"`` layers)
    produces ``self.vpred``; a second tanh MLP (``"polfc*"`` layers)
    produces the flat parameters for ``self.pd``.  ``self._act`` maps
    ``(stochastic, ob)`` to ``(action, vpred)``.

    When ``gaussian_fixed_var`` is true and ``ac_space`` is a
    ``gym.spaces.Box``, the log-std is the trainable variable ``"logstd"``
    (initialised to zeros); otherwise every distribution parameter comes
    from the ``"polfinal"`` layer.
    """
    assert isinstance(ob_space, gym.spaces.Box)

    pdtype = make_pdtype(ac_space)
    self.pdtype = pdtype
    batch_dim = None
    ob = U.get_placeholder(name="ob", dtype=tf.float32,
                           shape=[batch_dim] + list(ob_space.shape))

    with tf.variable_scope("obfilter"):
        self.ob_rms = RunningMeanStd(shape=ob_space.shape)

    centered = ob - self.ob_rms.mean
    obz = tf.clip_by_value(centered / self.ob_rms.std, -5.0, 5.0)

    # ---- value function ----
    hidden = obz
    for depth in range(num_hid_layers):
        pre_activation = U.dense(hidden, hid_size, "vffc%i" % (depth + 1),
                                 weight_init=U.normc_initializer(1.0))
        hidden = tf.nn.tanh(pre_activation)
    self.vpred = U.dense(hidden, 1, "vffinal",
                         weight_init=U.normc_initializer(1.0))[:, 0]

    # ---- policy ----
    hidden = obz
    for depth in range(num_hid_layers):
        pre_activation = U.dense(hidden, hid_size, "polfc%i" % (depth + 1),
                                 weight_init=U.normc_initializer(1.0))
        hidden = tf.nn.tanh(pre_activation)

    state_independent_std = gaussian_fixed_var and isinstance(ac_space, gym.spaces.Box)
    if not state_independent_std:
        pdparam = U.dense(hidden, pdtype.param_shape()[0], "polfinal",
                          U.normc_initializer(0.01))
    else:
        mean = U.dense(hidden, pdtype.param_shape()[0] // 2, "polfinal",
                       U.normc_initializer(0.01))
        logstd = tf.get_variable(name="logstd",
                                 shape=[1, pdtype.param_shape()[0] // 2],
                                 initializer=tf.zeros_initializer())
        # ``mean * 0.0`` gives a batch-shaped zero tensor to broadcast onto.
        pdparam = U.concatenate([mean, mean * 0.0 + logstd], axis=1)

    self.pd = pdtype.pdfromflat(pdparam)

    self.state_in = []
    self.state_out = []

    stochastic = tf.placeholder(dtype=tf.bool, shape=())
    ac = U.switch(stochastic, self.pd.sample(), self.pd.mode())
    self._act = U.function([stochastic, ob], [ac, self.vpred])
def _init(self, ob_space, ac_space, hid_size, num_hid_layers, activation='tanh',
          gaussian_fixed_var=True, keep=1.0):
    """Build value and policy MLPs with a selectable activation and dropout.

    The ``"ob"`` placeholder has width ``OBSERVATION_DIM`` when the
    module-level ``PREPROCESS`` flag is set, otherwise ``ob_space.shape[0]``.
    With ``PREPROCESS`` the networks consume ``ob`` directly; without it the
    observation is standardised with ``self.ob_rms`` and clipped to
    [-5, 5] first.

    Both the value trunk and the policy trunk start from the same input.
    Previously the policy trunk always started from the raw ``ob``, so with
    ``PREPROCESS`` disabled only the value network saw the normalised
    observation; behaviour with ``PREPROCESS`` enabled is unchanged.

    Args:
        ob_space: observation space; must be a ``gym.spaces.Box``.
        ac_space: action space handed to ``make_pdtype``.
        hid_size: width of every hidden layer.
        num_hid_layers: number of hidden layers per network.
        activation: one of ``'tanh'``, ``'elu'`` or ``'lrelu'``.
        gaussian_fixed_var: if true and ``ac_space`` is a Box, the log-std
            is a free ``"logstd"`` variable rather than a network output.
        keep: ``keep_prob`` passed to ``tf.nn.dropout`` after every hidden
            layer.

    Raises:
        NotImplementedError: if ``activation`` is not a supported name.
    """
    assert isinstance(ob_space, gym.spaces.Box)

    self.pdtype = pdtype = make_pdtype(ac_space)
    sequence_length = None

    ob_shape = OBSERVATION_DIM if PREPROCESS else ob_space.shape[0]
    ob = U.get_placeholder(name="ob", dtype=tf.float32,
                           shape=[sequence_length, ob_shape])

    activations = {
        'tanh': tf.nn.tanh,
        'elu': tf.nn.elu,
        'lrelu': lambda x: tf.maximum(x, 0.01 * x),
    }
    activ = activations.get(activation)
    if activ is None:
        raise NotImplementedError("Not available activation: " + activation)

    if PREPROCESS:
        net_input = ob
    else:
        with tf.variable_scope("obfilter"):
            self.ob_rms = RunningMeanStd(shape=ob_space.shape)
        net_input = tf.clip_by_value(
            (ob - self.ob_rms.mean) / self.ob_rms.std, -5.0, 5.0)

    # Value network.
    last_out = net_input
    for i in range(num_hid_layers):
        last_out = activ(U.dense(last_out, hid_size, "vffc%i" % (i + 1),
                                 weight_init=U.normc_initializer(1.0)))
        last_out = tf.nn.dropout(last_out, keep_prob=keep, name="vdrop%i" % (i + 1))
    self.vpred = U.dense(last_out, 1, "vffinal",
                         weight_init=U.normc_initializer(1.0))[:, 0]

    # Policy network: same input as the value network (see docstring).
    last_out = net_input
    for i in range(num_hid_layers):
        last_out = activ(U.dense(last_out, hid_size, "polfc%i" % (i + 1),
                                 weight_init=U.normc_initializer(1.0)))
        last_out = tf.nn.dropout(last_out, keep_prob=keep, name="pdrop%i" % (i + 1))

    if gaussian_fixed_var and isinstance(ac_space, gym.spaces.Box):
        mean = U.dense(last_out, pdtype.param_shape()[0] // 2, "polfinal",
                       U.normc_initializer(0.01))
        logstd = tf.get_variable(name="logstd",
                                 shape=[1, pdtype.param_shape()[0] // 2],
                                 initializer=tf.zeros_initializer())
        # Broadcast the single logstd row over the batch.
        pdparam = U.concatenate([mean, mean * 0.0 + logstd], axis=1)
    else:
        pdparam = U.dense(last_out, pdtype.param_shape()[0], "polfinal",
                          U.normc_initializer(0.01))

    self.pd = pdtype.pdfromflat(pdparam)

    self.state_in = []
    self.state_out = []

    stochastic = tf.placeholder(dtype=tf.bool, shape=())
    ac = U.switch(stochastic, self.pd.sample(), self.pd.mode())
    self._act = U.function([stochastic, ob], [ac, self.vpred])
def __init__(self, sess, ob_space, ac_space, nenv, nsteps, nstack, reuse=False):
    """Build a three-layer tanh MLP with Gaussian policy and value heads.

    The input placeholder has shape
    ``(nenv * nsteps, ob_space.shape[0] * nstack)``.  Observations are
    standardised with ``self.ob_rms`` and clipped to [-5, 5]; a second
    ``RunningMeanStd`` (``self.ret_rms``) is used to form ``self.vnorm``
    from the value output.

    Exposes ``self.X``, ``self.vf``, ``self.vnorm``, ``self.pd``,
    ``self.initial_state`` and two callables:

    * ``step(stoch, ob, ...)`` -> ``(action, value, [])``
    * ``value(ob, ...)`` -> value estimate

    Args:
        sess: session used by ``step`` and ``value`` to run the graph.
        ob_space: observation space; only ``shape[0]`` is read.
        ac_space: action space; ``shape[0]`` gives the action dimension.
        nenv, nsteps: their product is the fixed batch size.
        nstack: multiplier on the observation width.
        reuse: forwarded to every ``tf.variable_scope`` opened here.
    """
    batch_size = nenv * nsteps
    ob_shape = (batch_size, ob_space.shape[0] * nstack)
    action_dim = ac_space.shape[0]
    X = tf.placeholder(tf.float32, ob_shape)  # obs

    self.pdtype = pdtype = make_pdtype(ac_space)

    with tf.variable_scope("obfilter", reuse=reuse):
        self.ob_rms = RunningMeanStd(shape=ob_shape[1:])
    with tf.variable_scope("retfilter", reuse=reuse):
        self.ret_rms = RunningMeanStd(shape=(1,))

    obz = tf.clip_by_value((X - self.ob_rms.mean) / self.ob_rms.std, -5.0, 5.0)

    with tf.variable_scope("model", reuse=reuse):
        hidden = obz
        for layer_name in ("fc1", "fc2", "fc3"):
            hidden = tf.nn.tanh(dense(hidden, 128, layer_name,
                                      weight_init=U.normc_initializer(1.0),
                                      bias_init=0.0))
        mean = dense(hidden, action_dim, "mean",
                     weight_init=U.normc_initializer(0.1), bias_init=0.0)
        logstd_var = tf.get_variable("logstd", [action_dim], tf.float32,
                                     tf.zeros_initializer())
        logstd_row = tf.expand_dims(logstd_var, 0)
        # Broadcast the single logstd row over the batch.
        pdparam = U.concatenate([mean, mean * 0.0 + logstd_row], axis=1)
        vf = dense(hidden, 1, "v",
                   weight_init=U.normc_initializer(1.0), bias_init=0.0)

    v0 = vf[:, 0]
    self.pd = pdtype.pdfromflat(pdparam)

    stochastic = tf.placeholder(dtype=tf.bool, shape=())
    a0 = U.switch(stochastic, self.pd.sample(), self.pd.mode())
    self.initial_state = []  # not stateful

    def step(stoch, ob, *_args, **_kwargs):
        feed = {stochastic: stoch, X: ob}
        action, value_estimate = sess.run([a0, v0], feed)
        return action, value_estimate, []  # dummy state

    def value(ob, *_args, **_kwargs):
        return sess.run(v0, {X: ob})

    self.X = X
    self.vf = vf
    self.vnorm = (self.vf - self.ret_rms.mean) / self.ret_rms.std
    self.step = step
    self.value = value
def _init(self, ob_space, ac_space, hid_size, num_hid_layers, gaussian_fixed_var=True,
          num_options=2, dc=0):
    """Build a two-option policy: Q-values, policy over options, intra-option policy.

    All networks read ``obz_pure``: the observation with its last
    ``ac_space.shape[0]`` entries removed, standardised with
    ``self.ob_rms_only`` and clipped to [-5, 5].

    Attributes set: ``vpred`` (value of the option selected by
    ``option[0]``), ``op_pi`` (softmax over the two option logits),
    ``tpred`` (sigmoid termination head), ``pd``, ``last_action`` and the
    compiled functions ``_act``, ``_get_v``, ``get_term``, ``get_tpred``,
    ``get_vpred`` and ``_get_op``.  ``get_term`` always returns a constant
    ``[True]``.

    ``self._act`` returns ``[ac, vpred, policy_features, logstd]``.  When
    the fixed-variance branch is not taken, ``logstd`` is now the second
    half of the ``"polfinal"`` output; previously it was unbound on that
    path and building ``self._act`` raised ``NameError``.

    Args:
        ob_space: observation space; must be a ``gym.spaces.Box``.
        ac_space: action space; ``shape[0]`` is the action dimension.
        hid_size: width of every hidden layer.
        num_hid_layers: number of hidden layers per network.
        gaussian_fixed_var: if true and ``ac_space`` is a Box, use a
            per-option free ``"logstd"`` variable.
        num_options: number of options passed to ``dense3D2``; the value
            and option networks here are hard-wired for two options.
        dc: stored on ``self.dc``; not otherwise used in this function.
    """
    assert isinstance(ob_space, gym.spaces.Box)

    # Dimensions of the action and observation spaces.
    self.ac_space_dim = ac_space.shape[0]
    self.ob_space_dim = ob_space.shape[0]
    self.dc = dc
    self.last_action = tf.zeros(ac_space.shape, dtype=tf.float32)
    self.last_action_init = tf.zeros(ac_space.shape, dtype=tf.float32)
    self.num_options = num_options
    self.pdtype = pdtype = make_pdtype(ac_space)
    sequence_length = None

    ob = U.get_placeholder(name="ob", dtype=tf.float32,
                           shape=[sequence_length] + list(ob_space.shape))
    option = U.get_placeholder(name="option", dtype=tf.int32, shape=[None])

    # Filter for the "pure" observation, i.e. excluding u[k-1].
    obs_shape_pure = ((self.ob_space_dim - self.ac_space_dim),)
    with tf.variable_scope("obfilter"):
        self.ob_rms = RunningMeanStd(shape=ob_space.shape)
    with tf.variable_scope("obfilter_pure"):
        self.ob_rms_only = RunningMeanStd(shape=obs_shape_pure)

    # ``obz`` is not consumed below; it is kept so the graph is unchanged.
    obz = tf.clip_by_value((ob - self.ob_rms.mean) / self.ob_rms.std, -5.0, 5.0)
    obz_pure = tf.clip_by_value(
        (ob[:, :-self.ac_space_dim] - self.ob_rms_only.mean) / self.ob_rms_only.std,
        -5.0, 5.0)

    # Q-function network: one MLP per option.
    last_out0 = obz_pure  # for option 0
    last_out1 = obz_pure  # for option 1
    for i in range(num_hid_layers):
        last_out0 = tf.nn.tanh(U.dense(last_out0, hid_size, "vffc0%i" % (i + 1),
                                       weight_init=U.normc_initializer(1.0)))
        last_out1 = tf.nn.tanh(U.dense(last_out1, hid_size, "vffc1%i" % (i + 1),
                                       weight_init=U.normc_initializer(1.0)))
    last_out0 = U.dense(last_out0, 1, "vfff0", weight_init=U.normc_initializer(1.0))
    last_out1 = U.dense(last_out1, 1, "vfff1", weight_init=U.normc_initializer(1.0))

    # Q(s, o) for the option given by option[0].
    self.vpred = U.switch(option[0], last_out1, last_out0)[:, 0]

    # Policy over options: one scalar logit per option.
    last_out0 = obz_pure  # for option 0
    last_out1 = obz_pure  # for option 1
    for i in range(num_hid_layers):
        last_out0 = tf.nn.tanh(U.dense(last_out0, hid_size, "oppi0%i" % (i + 1),
                                       weight_init=U.normc_initializer(1.0)))
        last_out1 = tf.nn.tanh(U.dense(last_out1, hid_size, "oppi1%i" % (i + 1),
                                       weight_init=U.normc_initializer(1.0)))
    last_out0 = U.dense(last_out0, 1, "oppif0", weight_init=U.normc_initializer(1.0))
    last_out1 = U.dense(last_out1, 1, "oppif1", weight_init=U.normc_initializer(1.0))
    last_out = tf.concat([last_out0, last_out1], 1)

    # Probabilities for executing each option.
    self.op_pi = tf.nn.softmax(last_out)

    self.tpred = tf.nn.sigmoid(
        dense3D2(tf.stop_gradient(last_out), 1, "termhead", option,
                 num_options=num_options,
                 weight_init=U.normc_initializer(1.0)))[:, 0]
    # We always terminate.
    termination_sample = tf.constant([True])

    # Intra-option policy.
    last_out = obz_pure
    for i in range(num_hid_layers):
        last_out = tf.nn.tanh(U.dense(last_out, hid_size, "polfc%i" % (i + 1),
                                      weight_init=U.normc_initializer(1.0)))

    if gaussian_fixed_var and isinstance(ac_space, gym.spaces.Box):
        mean = dense3D2(last_out, pdtype.param_shape()[0] // 2, "polfinal", option,
                        num_options=num_options,
                        weight_init=U.normc_initializer(0.01), bias=False)
        mean = tf.nn.tanh(mean)
        logstd = tf.get_variable(
            name="logstd",
            shape=[num_options, 1, pdtype.param_shape()[0] // 2],
            initializer=tf.zeros_initializer())
        # Pick the logstd row of the active option and broadcast over the batch.
        pdparam = U.concatenate([mean, mean * 0.0 + logstd[option[0]]], axis=1)
    else:
        pdparam = U.dense(last_out, pdtype.param_shape()[0], "polfinal",
                          U.normc_initializer(0.01))
        # Bind ``logstd`` on this path too (it is an output of self._act).
        # The flat layout is [mean, logstd], matching the concatenation
        # used in the fixed-variance branch above.
        logstd = pdparam[:, pdtype.param_shape()[0] // 2:]

    self.pd = pdtype.pdfromflat(pdparam)

    self.state_in = []
    self.state_out = []

    # No zero-order hold here: both option policies are fully functional.
    stochastic = tf.placeholder(dtype=tf.bool, shape=())
    ac = U.switch(stochastic, self.pd.sample(), self.pd.mode())
    ac = tf.clip_by_value(ac, -1.0, 1.0)

    self.last_action = tf.stop_gradient(ac)
    self._act = U.function([stochastic, ob, option],
                           [ac, self.vpred, last_out, logstd])
    self._get_v = U.function([ob, option], [self.vpred])
    self.get_term = U.function([ob, option], [termination_sample])
    self.get_tpred = U.function([ob, option], [self.tpred])
    self.get_vpred = U.function([ob, option], [self.vpred])
    self._get_op = U.function([ob], [self.op_pi])
def _init(self, ob_space, ac_space, hid_size, num_hid_layers, gaussian_fixed_var=True,
          num_options=2, dc=0, w_intfc=True):
    """Build an option-critic style policy with an interest function.

    Five independent tanh MLPs are stacked on the normalised observation
    (standardised with ``self.ob_rms``, clipped to [-5, 5]):

    * ``"vffc*"``   -> ``self.vpred`` via an option-indexed ``dense3D2`` head
    * ``"termfc*"`` -> ``self.tpred`` (sigmoid; the head input is wrapped in
      ``tf.stop_gradient``)
    * ``"polfc*"``  -> ``self.pd`` (intra-option action distribution)
    * ``"intfc*"``  -> ``self.intfc`` (sigmoid, one output per option)
    * ``"OP*"``     -> ``self.op_pi`` (softmax over options)

    Compiled functions stored on ``self``: ``_act``, ``get_term``,
    ``get_tpred``, ``get_vpred``, ``_get_op_int``, ``_get_intfc`` and
    ``_get_op``.  ``get_term`` samples termination by comparing
    ``self.tpred`` with uniform noise.

    ``w_intfc`` and ``dc`` are only stored on ``self`` here.
    """
    assert isinstance(ob_space, gym.spaces.Box)

    self.w_intfc = w_intfc
    self.state_in = []
    self.state_out = []
    self.dc = dc
    self.num_options = num_options
    self.pdtype = pdtype = make_pdtype(ac_space)

    ob = U.get_placeholder(name="ob", dtype=tf.float32,
                           shape=[None] + list(ob_space.shape))
    option = U.get_placeholder(name="option", dtype=tf.int32, shape=[None])

    with tf.variable_scope("obfilter"):
        self.ob_rms = RunningMeanStd(shape=ob_space.shape)
    obz = tf.clip_by_value((ob - self.ob_rms.mean) / self.ob_rms.std, -5.0, 5.0)

    def tanh_stack(layer_name_fmt):
        # Hidden layers are numbered from 1 in their variable names.
        net = obz
        for layer_no in range(1, num_hid_layers + 1):
            net = tf.nn.tanh(U.dense(net, hid_size, layer_name_fmt % layer_no,
                                     weight_init=U.normc_initializer(1.0)))
        return net

    # Option-conditioned value head.
    value_features = tanh_stack("vffc%i")
    self.vpred = dense3D2(value_features, 1, "vffinal", option,
                          num_options=num_options,
                          weight_init=U.normc_initializer(1.0))[:, 0]

    # Termination head and its sampled outcome.
    term_features = tanh_stack("termfc%i")
    term_logit = dense3D2(tf.stop_gradient(term_features), 1, "termhead", option,
                          num_options=num_options,
                          weight_init=U.normc_initializer(1.0))
    self.tpred = tf.nn.sigmoid(term_logit)[:, 0]
    termination_sample = tf.greater(
        self.tpred, tf.random_uniform(shape=tf.shape(self.tpred), maxval=1.))

    # Intra-option policy.
    policy_features = tanh_stack("polfc%i")
    flat_size = pdtype.param_shape()[0]
    if gaussian_fixed_var and isinstance(ac_space, gym.spaces.Box):
        mean = dense3D2(policy_features, flat_size // 2, "polfinal", option,
                        num_options=num_options,
                        weight_init=U.normc_initializer(0.01))
        logstd = tf.get_variable(name="logstd",
                                 shape=[num_options, 1, flat_size // 2],
                                 initializer=tf.zeros_initializer())
        # Use the logstd row of the option given by option[0].
        pdparam = U.concatenate([mean, mean * 0.0 + logstd[option[0]]], axis=1)
    else:
        pdparam = U.dense(policy_features, flat_size, "polfinal",
                          U.normc_initializer(0.01))
    self.pd = pdtype.pdfromflat(pdparam)

    stochastic = tf.placeholder(dtype=tf.bool, shape=())
    action = U.switch(stochastic, self.pd.sample(), self.pd.mode())

    # Interest function.
    interest_features = tanh_stack("intfc%i")
    self.intfc = tf.sigmoid(U.dense(interest_features, num_options, "intfcfinal",
                                    weight_init=U.normc_initializer(1.0)))

    # Policy over options.
    option_features = tanh_stack("OP%i")
    self.op_pi = tf.nn.softmax(U.dense(option_features, num_options, "OPfinal",
                                       weight_init=U.normc_initializer(1.0)))

    self._act = U.function([stochastic, ob, option], [action])
    self.get_term = U.function([ob, option], [termination_sample])
    self.get_tpred = U.function([ob, option], [self.tpred])
    self.get_vpred = U.function([ob, option], [self.vpred])
    self._get_op_int = U.function([ob], [self.op_pi, self.intfc])
    self._get_intfc = U.function([ob], [self.intfc])
    self._get_op = U.function([ob], [self.op_pi])
def _init(self, ob_space, ac_space, hid_size, num_hid_layers, gaussian_fixed_var=True,
          num_options=2, dc=0):
    """Build a two-option policy where option 0 replays the previous action.

    Two observation views are used: ``obz`` (full observation, standardised
    with ``self.ob_rms``) and ``obz_pure`` (observation without its last
    ``ac_space.shape[0]`` entries, standardised with ``self.ob_rms_only``);
    both are clipped to [-5, 5].  The option-0 value/option networks read
    ``obz``; the option-1 networks and the intra-option policy read
    ``obz_pure``.

    The emitted action is the policy output when ``option[0]`` is non-zero
    and otherwise the last ``ac_space.shape[0]`` entries of ``ob`` (with
    gradients stopped); it is clipped to [-1, 1] in both cases.

    Attributes set: ``vpred``, ``op_pi``, ``tpred``, ``pd``,
    ``last_action`` and the compiled functions ``_act``, ``_get_v``,
    ``get_term``, ``get_tpred``, ``get_vpred`` and ``_get_op``.
    ``get_term`` always returns a constant ``[True]``.

    ``self._act`` returns ``[ac, vpred, policy_features, logstd]``.  When
    the fixed-variance branch is not taken, ``logstd`` is now the second
    half of the ``"polfinal"`` output; previously it was unbound on that
    path and building ``self._act`` raised ``NameError``.

    Args:
        ob_space: observation space; must be a ``gym.spaces.Box``.
        ac_space: action space; ``shape[0]`` is the action dimension.
        hid_size: width of every hidden layer.
        num_hid_layers: number of hidden layers per network.
        gaussian_fixed_var: if true and ``ac_space`` is a Box, use a
            per-option free ``"logstd"`` variable.
        num_options: number of options passed to ``dense3D2``; the value
            and option networks here are hard-wired for two options.
        dc: stored on ``self.dc``; not otherwise used in this function.
    """
    assert isinstance(ob_space, gym.spaces.Box)

    self.ac_space_dim = ac_space.shape[0]
    self.ob_space_dim = ob_space.shape[0]
    self.dc = dc
    self.last_action = tf.zeros(ac_space.shape, dtype=tf.float32)
    self.last_action_init = tf.zeros(ac_space.shape, dtype=tf.float32)
    self.num_options = num_options
    self.pdtype = pdtype = make_pdtype(ac_space)
    sequence_length = None

    ob = U.get_placeholder(name="ob", dtype=tf.float32,
                           shape=[sequence_length] + list(ob_space.shape))
    option = U.get_placeholder(name="option", dtype=tf.int32, shape=[None])

    # Filter for the "pure" observation, i.e. excluding u[k-1].
    obs_shape_pure = ((self.ob_space_dim - self.ac_space_dim),)
    with tf.variable_scope("obfilter"):
        self.ob_rms = RunningMeanStd(shape=ob_space.shape)
    with tf.variable_scope("obfilter_pure"):
        self.ob_rms_only = RunningMeanStd(shape=obs_shape_pure)

    obz = tf.clip_by_value((ob - self.ob_rms.mean) / self.ob_rms.std, -5.0, 5.0)
    obz_pure = tf.clip_by_value(
        (ob[:, :-self.ac_space_dim] - self.ob_rms_only.mean) / self.ob_rms_only.std,
        -5.0, 5.0)

    # Q-function network: one MLP per option.
    last_out0 = obz  # for option 0
    last_out1 = obz_pure  # for option 1
    for i in range(num_hid_layers):
        last_out0 = tf.nn.tanh(U.dense(last_out0, hid_size, "vffc0%i" % (i + 1),
                                       weight_init=U.normc_initializer(1.0)))
        last_out1 = tf.nn.tanh(U.dense(last_out1, hid_size, "vffc1%i" % (i + 1),
                                       weight_init=U.normc_initializer(1.0)))
    last_out0 = U.dense(last_out0, 1, "vfff0", weight_init=U.normc_initializer(1.0))
    last_out1 = U.dense(last_out1, 1, "vfff1", weight_init=U.normc_initializer(1.0))

    # Value of the option given by option[0].
    self.vpred = U.switch(option[0], last_out1, last_out0)[:, 0]

    # Policy over options: one scalar logit per option.
    last_out0 = obz  # for option 0
    last_out1 = obz_pure  # for option 1
    for i in range(num_hid_layers):
        last_out0 = tf.nn.tanh(U.dense(last_out0, hid_size, "oppi0%i" % (i + 1),
                                       weight_init=U.normc_initializer(1.0)))
        last_out1 = tf.nn.tanh(U.dense(last_out1, hid_size, "oppi1%i" % (i + 1),
                                       weight_init=U.normc_initializer(1.0)))
    last_out0 = U.dense(last_out0, 1, "oppif0", weight_init=U.normc_initializer(1.0))
    last_out1 = U.dense(last_out1, 1, "oppif1", weight_init=U.normc_initializer(1.0))
    last_out = tf.concat([last_out0, last_out1], 1)
    self.op_pi = tf.nn.softmax(last_out)

    self.tpred = tf.nn.sigmoid(
        dense3D2(tf.stop_gradient(last_out), 1, "termhead", option,
                 num_options=num_options,
                 weight_init=U.normc_initializer(1.0)))[:, 0]
    # Termination is not sampled from tpred: we always terminate.
    termination_sample = tf.constant([True])

    # Intra-option policy.
    last_out = obz_pure
    for i in range(num_hid_layers):
        last_out = tf.nn.tanh(U.dense(last_out, hid_size, "polfc%i" % (i + 1),
                                      weight_init=U.normc_initializer(1.0)))

    if gaussian_fixed_var and isinstance(ac_space, gym.spaces.Box):
        mean = dense3D2(last_out, pdtype.param_shape()[0] // 2, "polfinal", option,
                        num_options=num_options,
                        weight_init=U.normc_initializer(0.01), bias=False)
        mean = tf.nn.tanh(mean)
        logstd = tf.get_variable(
            name="logstd",
            shape=[num_options, 1, pdtype.param_shape()[0] // 2],
            initializer=tf.zeros_initializer())
        # Pick the logstd row of the active option and broadcast over the batch.
        pdparam = U.concatenate([mean, mean * 0.0 + logstd[option[0]]], axis=1)
    else:
        pdparam = U.dense(last_out, pdtype.param_shape()[0], "polfinal",
                          U.normc_initializer(0.01))
        # Bind ``logstd`` on this path too (it is an output of self._act).
        # The flat layout is [mean, logstd], matching the concatenation
        # used in the fixed-variance branch above.
        logstd = pdparam[:, pdtype.param_shape()[0] // 2:]

    self.pd = pdtype.pdfromflat(pdparam)

    self.state_in = []
    self.state_out = []

    stochastic = tf.placeholder(dtype=tf.bool, shape=())
    ac = U.switch(stochastic, self.pd.sample(), self.pd.mode())
    # Option 0 holds the previous action, read from the tail of the observation.
    ac = U.switch(option[0], ac, tf.stop_gradient(ob[:, -self.ac_space_dim:]))
    ac = tf.clip_by_value(ac, -1.0, 1.0)

    self.last_action = tf.stop_gradient(ac)
    self._act = U.function([stochastic, ob, option],
                           [ac, self.vpred, last_out, logstd])
    self._get_v = U.function([ob, option], [self.vpred])
    self.get_term = U.function([ob, option], [termination_sample])
    self.get_tpred = U.function([ob, option], [self.tpred])
    self.get_vpred = U.function([ob, option], [self.vpred])
    self._get_op = U.function([ob], [self.op_pi])
def _init(self, ob_space, ac_space, hid_size_V, hid_size_actor, num_hid_layers, V_keep_prob,
          pol_keep_prob, mc_samples, layer_norm, activation_critic, activation_actor,
          dropout_on_V, dropout_on_policy, tau, gaussian_fixed_var=True):
    """Build value and policy networks with optional Monte-Carlo dropout copies.

    Alongside the plain value network (``self.vpred``) a list of
    ``mc_samples`` networks is threaded through ``generate_dropout_layer``
    layer by layer; each layer is re-applied with ``reuse=True`` so the
    copies share the plain network's variables.  Their average is
    ``self.vpred_mc_mean``.

    The policy is either a plain MLP (``dropout_on_policy`` false) or is
    built the same way as the value copies, in which case ``self.pd`` is
    created from the list of copies together with ``tau``.

    ``self._act`` is bound to ``self.dropout_act`` or ``self.reg_act``
    depending on ``dropout_on_policy``; the value returned next to the
    action is ``self.vpred_mc_mean`` when ``dropout_on_V`` is true, else
    ``self.vpred``.

    NOTE(review): ``generate_dropout_layer`` is defined elsewhere; this
    code presumably relies on it calling ``apply_layer`` immediately, since
    the closures below read the loop variable ``i`` -- confirm.
    """
    assert isinstance(ob_space, gym.spaces.Box)

    self.dropout_on_policy = dropout_on_policy
    self.pdtype = pdtype = make_pdtype(ac_space)

    ob = U.get_placeholder(name="ob", dtype=tf.float32,
                           shape=[None] + list(ob_space.shape))

    with tf.variable_scope("obfilter"):
        self.ob_rms = RunningMeanStd(shape=ob_space.shape)
    obz = tf.clip_by_value((ob - self.ob_rms.mean) / self.ob_rms.std, -5.0, 5.0)

    self.mc_samples = mc_samples
    self.pol_keep_prob = pol_keep_prob
    self.V_keep_prob = V_keep_prob

    # ---- Value function ----
    with tf.variable_scope("value_function"):
        critic_out = obz
        dropout_networks = [critic_out] * self.mc_samples

        for i in range(num_hid_layers):
            if layer_norm:
                critic_out = activation_critic(tc.layers.layer_norm(
                    tf.layers.dense(critic_out, hid_size_V, name="vffc%i" % (i + 1),
                                    kernel_initializer=U.normc_initializer(1.0)),
                    center=True, scope="vffc_activation%i" % (i + 1), scale=True))

                def apply_layer(x):
                    # Re-apply the layer just created, sharing its variables.
                    return activation_critic(tc.layers.layer_norm(
                        tf.layers.dense(x, hid_size_V, name="vffc%i" % (i + 1),
                                        reuse=True),
                        center=True, scope="vffc_activation%i" % (i + 1),
                        scale=True, reuse=True))
            else:
                critic_out = activation_critic(
                    tf.layers.dense(critic_out, hid_size_V, name="vffc%i" % (i + 1),
                                    kernel_initializer=U.normc_initializer(1.0)))

                def apply_layer(x):
                    # Re-apply the layer just created, sharing its variables.
                    return activation_critic(
                        tf.layers.dense(x, hid_size_V, name="vffc%i" % (i + 1),
                                        reuse=True))

            dropout_networks = generate_dropout_layer(
                apply_layer, dropout_networks, self.V_keep_prob)

        # Final layer.
        self.vpred = tf.layers.dense(
            critic_out, 1, name="vffinal",
            kernel_initializer=U.normc_initializer(1.0))[:, 0]

        def apply_layer(x):
            return tf.layers.dense(x, 1, activation=None,
                                   name="vffinal", reuse=True)[:, 0]

        dropout_networks = generate_dropout_layer(
            apply_layer, dropout_networks, self.V_keep_prob)

        self.vpred_mc_mean = tf.add_n(dropout_networks) / float(len(dropout_networks))
        self.vpred_dropout_networks = dropout_networks

    # ---- Policy ----
    actor_out = obz
    with tf.variable_scope("policy"):
        if not self.dropout_on_policy:
            for i in range(num_hid_layers):
                actor_out = U.dense(actor_out, hid_size_actor, "polfc%i" % (i + 1),
                                    weight_init=U.normc_initializer(1.0))
                actor_out = activation_actor(actor_out)

            if gaussian_fixed_var and isinstance(ac_space, gym.spaces.Box):
                mean = U.dense(actor_out, pdtype.param_shape()[0] // 2, "polfinal",
                               U.normc_initializer(0.01))
                logstd = tf.get_variable(
                    name="logstd",
                    shape=[1, pdtype.param_shape()[0] // 2],
                    initializer=tf.zeros_initializer())
                # Broadcast the single logstd row over the batch.
                pdparam = U.concatenate([mean, mean * 0.0 + logstd], axis=1)
            else:
                pdparam = U.dense(actor_out, pdtype.param_shape()[0], "polfinal",
                                  U.normc_initializer(0.01))
            self.pd = pdtype.pdfromflat(pdparam)
        else:
            dropout_networks = [actor_out] * mc_samples
            dropout_networks = generate_dropout_layer(
                lambda x: x, dropout_networks, 1.0)

            for i in range(num_hid_layers):
                actor_out = activation_actor(
                    tf.layers.dense(
                        actor_out, hid_size_actor, activation=None,
                        name="polfc%i" % (i + 1),
                        kernel_initializer=U.normc_initializer(1.0),
                        bias_initializer=tf.zeros_initializer()))

                def apply_layer(x):
                    # Re-apply the layer just created, sharing its variables.
                    return activation_actor(
                        tf.layers.dense(
                            x, hid_size_actor, activation=None,
                            name="polfc%i" % (i + 1),
                            kernel_initializer=U.normc_initializer(1.0),
                            bias_initializer=tf.zeros_initializer(),
                            reuse=True))

                dropout_networks = generate_dropout_layer(
                    apply_layer, dropout_networks, pol_keep_prob)

            # The result is unused, but this call creates the "polfinal"
            # variables that the reuse=True application below shares.
            net = tf.layers.dense(
                actor_out, pdtype.param_shape()[0] // 2, name="polfinal",
                activation=None, kernel_initializer=U.normc_initializer(0.01))

            def apply_layer(x):
                return tf.layers.dense(
                    x, pdtype.param_shape()[0] // 2, activation=None,
                    name="polfinal",
                    kernel_initializer=U.normc_initializer(0.01), reuse=True)

            dropout_networks = generate_dropout_layer(
                apply_layer, dropout_networks, pol_keep_prob)
            self.pd = pdtype.pdfromflat(dropout_networks, tau)

    self.state_in = []
    self.state_out = []

    stochastic = tf.placeholder(dtype=tf.bool, shape=())
    ac = U.switch(stochastic, self.pd.sample(), self.pd.mode())

    # Value estimate reported together with the action.
    vact = self.vpred_mc_mean if dropout_on_V else self.vpred

    if dropout_on_policy:
        # One compiled function per dropout copy of the policy.
        self._actsfunc = [U.function([ob], [x, vact]) for x in dropout_networks]
        self._act = self.dropout_act
    else:
        self._actfunc = U.function([stochastic, ob], [ac, vact])
        self._act = self.reg_act
def _init(self, ob_space, ac_space, hid_size, num_hid_layers, gaussian_fixed_var=True,
          gmm_comp=1, mirror_loss=False, observation_permutation=[],
          action_permutation=[]):
    """Build an MLP policy with optional mixture components and mirror output.

    Observations are standardised with ``self.ob_rms`` and clipped to
    [-5, 5].  A tanh MLP produces ``self.vpred``; a second tanh MLP (built
    with ``U.dense_wparams`` so its weights can be re-applied) produces the
    parameters of ``self.pd``.  ``self._act`` maps ``(stochastic, ob)`` to
    ``(action, vpred)``.

    With ``mirror_loss`` the policy weights are applied a second time to the
    permuted observation and the result is permuted back into
    ``self.mirrored_mean``.

    Each permutation entry encodes a target index (its absolute value,
    truncated to int) and a sign.  The sign is taken with ``np.copysign``,
    so an entry of exactly ``0`` maps to index 0 with sign +1; with
    ``np.sign`` it produced an all-zero row.  Non-zero entries behave as
    before.

    Args:
        ob_space: observation space; must be a ``gym.spaces.Box``.
        ac_space: action space handed to ``make_pdtype``.
        hid_size: width of every hidden layer.
        num_hid_layers: number of hidden layers per network.
        gaussian_fixed_var: if true and ``ac_space`` is a Box, the log-std
            is a free ``"logstd"`` variable; required by ``mirror_loss``.
        gmm_comp: number of mixture components; ``1`` selects the plain
            single-distribution path.
        mirror_loss: whether to build ``self.mirrored_mean``.
        observation_permutation: signed index list for observations.
        action_permutation: signed index list for actions.
    """
    assert isinstance(ob_space, gym.spaces.Box)
    if mirror_loss:
        assert gaussian_fixed_var  # assume fixed std for now

    # pd = probability distribution over actions.
    self.pdtype = pdtype = make_pdtype(ac_space, gmm_comp)
    sequence_length = None

    self.mirror_loss = mirror_loss
    if mirror_loss:
        # Signed permutation matrices: row i has a single +/-1 entry at
        # column |perm[i]|.
        obs_perm_mat = np.zeros(
            (len(observation_permutation), len(observation_permutation)),
            dtype=np.float32)
        act_perm_mat = np.zeros(
            (len(action_permutation), len(action_permutation)),
            dtype=np.float32)
        for i, perm in enumerate(observation_permutation):
            obs_perm_mat[i][int(np.abs(perm))] = np.copysign(1.0, perm)
        for i, perm in enumerate(action_permutation):
            act_perm_mat[i][int(np.abs(perm))] = np.copysign(1.0, perm)

    ob = U.get_placeholder(name="ob", dtype=tf.float32,
                           shape=[sequence_length] + list(ob_space.shape))

    with tf.variable_scope("obfilter"):
        self.ob_rms = RunningMeanStd(shape=ob_space.shape)

    # Standardise: z = (x - mean) / std, then clip to [-5, 5].
    obz = tf.clip_by_value((ob - self.ob_rms.mean) / self.ob_rms.std, -5.0, 5.0)

    # Value network.
    last_out = obz
    for i in range(num_hid_layers):
        last_out = tf.nn.tanh(U.dense(last_out, hid_size, "vffc%i" % (i + 1),
                                      weight_init=U.normc_initializer(1.0)))
    self.vpred = U.dense(last_out, 1, "vffinal",
                         weight_init=U.normc_initializer(1.0))[:, 0]

    # Policy network; keep (weight, bias) pairs for the mirrored pass.
    last_out = obz
    params = []
    for i in range(num_hid_layers):
        rt, pw, pb = U.dense_wparams(last_out, hid_size, "polfc%i" % (i + 1),
                                     weight_init=U.normc_initializer(1.0))
        last_out = tf.nn.tanh(rt)
        params.append([pw, pb])

    if gaussian_fixed_var and isinstance(ac_space, gym.spaces.Box):
        if gmm_comp == 1:
            mean, pw, pb = U.dense_wparams(last_out, pdtype.param_shape()[0] // 2,
                                           "polfinal", U.normc_initializer(0.01))
            params.append([pw, pb])
            self.mean = mean
            logstd = tf.get_variable(name="logstd",
                                     shape=[1, pdtype.param_shape()[0] // 2],
                                     initializer=tf.zeros_initializer())
            # Broadcast the single logstd row over the batch.
            pdparam = U.concatenate([mean, mean * 0.0 + logstd], axis=1)
        else:
            means = U.dense(last_out, (pdtype.param_shape()[0] - gmm_comp) // 2,
                            "polfinal", U.normc_initializer(0.01))
            # Here logstd is initialised to -1 rather than 0.
            logstd = tf.get_variable(
                name="logstd",
                initializer=tf.constant(
                    np.ones((1, (pdtype.param_shape()[0] - gmm_comp) // 2),
                            dtype=np.float32) * (-1.0)))
            weights = tf.nn.softmax(
                U.dense(last_out, gmm_comp, "gmmweights", U.normc_initializer(0.01)))
            pdparam = U.concatenate([means, means * 0.0 + logstd, weights], axis=1)
    elif gmm_comp == 1:
        pdparam = U.dense(last_out, pdtype.param_shape()[0], "polfinal",
                          U.normc_initializer(0.01))
    else:
        meanstd = U.dense(last_out, pdtype.param_shape()[0] - gmm_comp, "polfinal",
                          U.normc_initializer(0.01))
        weights = tf.nn.softmax(
            U.dense(last_out, gmm_comp, "gmmweights", U.normc_initializer(0.01)))
        pdparam = U.concatenate([meanstd, weights], axis=1)

    if mirror_loss:
        # NOTE(review): the final-layer weights are only appended to
        # ``params`` on the gmm_comp == 1 fixed-variance path above; on the
        # other paths ``params[-1]`` is the last hidden layer -- confirm
        # that mirror_loss is never combined with them.
        mirrored_obz = tf.matmul(obz, obs_perm_mat)
        last_val = mirrored_obz
        for i in range(len(params) - 1):
            last_val = tf.nn.tanh(tf.matmul(last_val, params[i][0]) + params[i][1])
        mean_mir_obs = tf.matmul(last_val, params[-1][0]) + params[-1][1]
        self.mirrored_mean = tf.matmul(mean_mir_obs, act_perm_mat)

    if gmm_comp == 1:
        self.pd = pdtype.pdfromflat(pdparam)
    else:
        self.pd = pdtype.pdfromflat([pdparam, gmm_comp])

    self.state_in = []
    self.state_out = []

    stochastic = tf.placeholder(dtype=tf.bool, shape=())
    # Sample from the distribution when stochastic, otherwise take its mode.
    ac = U.switch(stochastic, self.pd.sample(), self.pd.mode())
    # Returns the chosen action and the predicted state value for ``ob``.
    self._act = U.function([stochastic, ob], [ac, self.vpred])
def _init(self, ob_space, ac_space, hid_size, num_hid_layers, gaussian_fixed_var=True, num_options=2, dc=0):
    """Build the TF graph of a two-option actor-critic (ReLU hidden layers).

    Graph parts, in construction order:
      * per-option value heads, selected by ``option[0]`` -> ``self.vpred``
      * policy over options -> ``self.op_pi`` (softmax) and ``self.op_pi_orig``
        (raw score difference, option 0 minus option 1)
      * termination head -> ``self.tpred`` (sampling is fixed to "terminate")
      * intra-option Gaussian policy on the observation without its trailing
        ``ac_space_dim`` entries (u[k-1] according to the original comments)

    When ``option[0]`` selects the second branch of ``U.switch`` the emitted
    action is the trailing ``ac_space_dim`` slice of the observation instead
    of the policy output.

    Args:
        ob_space: ``gym.spaces.Box`` observation space.
        ac_space: action space; only ``gym.spaces.Box`` is supported.
        hid_size: width of every hidden layer.
        num_hid_layers: number of hidden layers per sub-network.
        gaussian_fixed_var: if True, use a state-independent per-option
            ``logstd`` variable; otherwise the network outputs mean and logstd.
        num_options: number of options handled by the ``dense3D2`` heads.
        dc: stored unchanged on ``self.dc``.

    Raises:
        NotImplementedError: for a non-Box action space (previously this
            surfaced as a NameError further down).
    """
    assert isinstance(ob_space, gym.spaces.Box)

    self.ac_space_dim = ac_space.shape[0]
    self.ob_space_dim = ob_space.shape[0]
    self.dc = dc
    self.last_action = tf.zeros(ac_space.shape, dtype=tf.float32)
    self.last_action_init = tf.zeros(ac_space.shape, dtype=tf.float32)
    self.num_options = num_options
    self.pdtype = pdtype = make_pdtype(ac_space)
    sequence_length = None

    ob = U.get_placeholder(name="ob", dtype=tf.float32, shape=[sequence_length] + list(ob_space.shape))
    option = U.get_placeholder(name="option", dtype=tf.int32, shape=[None])

    # Separate running statistics for the "pure" observation, i.e. the
    # observation without its trailing ac_space_dim entries (u[k-1]).
    obs_shape_pure = ((self.ob_space_dim - self.ac_space_dim), )
    with tf.variable_scope("obfilter"):
        self.ob_rms = RunningMeanStd(shape=ob_space.shape)
    with tf.variable_scope("obfilter_pure"):
        self.ob_rms_only = RunningMeanStd(shape=obs_shape_pure)

    obz = tf.clip_by_value((ob - self.ob_rms.mean) / self.ob_rms.std, -5.0, 5.0)
    obz_pure = tf.clip_by_value(
        (ob[:, :-self.ac_space_dim] - self.ob_rms_only.mean) / self.ob_rms_only.std, -5.0, 5.0)

    # --- value heads: branch 0 sees the full observation, branch 1 the pure one.
    # Layer creation stays interleaved so the variable order is unchanged.
    last_out0 = obz
    last_out1 = obz_pure
    for i in range(num_hid_layers):
        last_out0 = tf.nn.relu(
            U.dense(last_out0, hid_size, "vffc0%i" % (i + 1), weight_init=U.normc_initializer(1.0)))
        last_out1 = tf.nn.relu(
            U.dense(last_out1, hid_size, "vffc1%i" % (i + 1), weight_init=U.normc_initializer(1.0)))
    last_out0 = U.dense(last_out0, 1, "vfff0", weight_init=U.normc_initializer(1.0))
    last_out1 = U.dense(last_out1, 1, "vfff1", weight_init=U.normc_initializer(1.0))
    self.vpred = U.switch(option[0], last_out1, last_out0)[:, 0]

    # --- policy over options: one scalar score per option.
    last_out0 = obz
    last_out1 = obz_pure
    for i in range(num_hid_layers):
        last_out0 = tf.nn.relu(
            U.dense(last_out0, hid_size, "oppi0%i" % (i + 1), weight_init=U.normc_initializer(1.0)))
        last_out1 = tf.nn.relu(
            U.dense(last_out1, hid_size, "oppi1%i" % (i + 1), weight_init=U.normc_initializer(1.0)))
    last_out0 = U.dense(last_out0, 1, "oppif0", weight_init=U.normc_initializer(1.0))
    last_out1 = U.dense(last_out1, 1, "oppif1", weight_init=U.normc_initializer(1.0))
    last_out = tf.concat([last_out0, last_out1], 1)
    # Raw score difference, exposed alongside the softmax probabilities.
    self.op_pi_orig = last_out0 - last_out1
    self.op_pi = tf.nn.softmax(last_out)

    # --- termination head (no gradient into the option scores).
    self.tpred = tf.nn.sigmoid(
        dense3D2(tf.stop_gradient(last_out), 1, "termhead", option, num_options=num_options,
                 weight_init=U.normc_initializer(1.0)))[:, 0]
    # Options always terminate; the learned tpred is not sampled.
    termination_sample = tf.constant([True])

    # --- intra-option policy on the pure observation.
    last_out = obz_pure
    for i in range(num_hid_layers):
        last_out = tf.nn.relu(
            U.dense(last_out, hid_size, "polfc%i" % (i + 1), weight_init=U.normc_initializer(1.0)))

    if gaussian_fixed_var and isinstance(ac_space, gym.spaces.Box):
        mean = dense3D2(last_out, pdtype.param_shape()[0] // 2, "polfinal", option,
                        num_options=num_options, weight_init=U.normc_initializer(0.01), bias=False)
        # Hard clip of the mean to [-1, 1] built from ReLUs (identity inside
        # the interval, saturating at -1 / +1 outside); this is not a tanh.
        mean = (-tf.nn.relu(-(mean - 1)) + tf.nn.relu(-(mean + 1))) + 1
        logstd = tf.get_variable(
            name="logstd", shape=[num_options, 1, pdtype.param_shape()[0] // 2],
            initializer=tf.zeros_initializer())
        pdparam = U.concatenate([mean, mean * 0.0 + logstd[option[0]]], axis=1)
    else:
        pdparam = U.dense(last_out, pdtype.param_shape()[0], "polfinal", U.normc_initializer(0.01))
        if not isinstance(ac_space, gym.spaces.Box):
            raise NotImplementedError(
                "the option policy needs a Gaussian mean/logstd and therefore a Box action space")
        # State-dependent variance: recover mean and logstd from the flat
        # parameters (same [mean, logstd] layout as the branch above) so the
        # outputs below are defined on this path as well.
        mean, logstd = tf.split(value=pdparam, num_or_size_splits=2, axis=1)

    self.pd = pdtype.pdfromflat(pdparam)

    self.state_in = []
    self.state_out = []

    # Deterministic action mean, with the same option switch as the action.
    ac_mean = U.switch(option[0], mean, tf.stop_gradient(ob[:, -self.ac_space_dim:]))
    self.ac_mean = tf.clip_by_value(ac_mean, -1.0, 1.0)

    stochastic = tf.placeholder(dtype=tf.bool, shape=())
    ac = U.switch(stochastic, self.pd.sample(), self.pd.mode())
    # Either the policy action or the previous action carried in the observation.
    ac = U.switch(option[0], ac, tf.stop_gradient(ob[:, -self.ac_space_dim:]))
    ac = tf.clip_by_value(ac, -1.0, 1.0)

    self.last_action = tf.stop_gradient(ac)
    self._act = U.function([stochastic, ob, option], [ac, self.vpred, last_out, logstd])
    self._get_v = U.function([ob, option], [self.vpred])
    self.get_term = U.function([ob, option], [termination_sample])
    self.get_tpred = U.function([ob, option], [self.tpred])
    self.get_vpred = U.function([ob, option], [self.vpred])
    self._get_op = U.function([ob], [self.op_pi])

    # Extra accessors: the (unclipped) switched action mean and the raw
    # option-score difference.
    self._act_mean = U.function([ob, option], [ac_mean])
    self._get_op_orig = U.function([ob], [self.op_pi_orig])
def _init(self, ob_space, ac_space, hid_size, num_hid_layers, gaussian_fixed_var=True, num_options=2, dc=0):
    """Build the TF graph of a two-option actor-critic (tanh hidden layers).

    Creates per-option value heads (``self.vpred``), a softmax policy over
    options (``self.op_pi``), a termination head (``self.tpred``) and a
    Gaussian intra-option policy whose mean is squashed with ``tanh``.
    Depending on ``option[0]`` the emitted action is either the policy
    output or the trailing ``ac_space_dim`` entries of the observation
    (the original comments describe this as a zero-order hold on u[k-1]).

    Args:
        ob_space: ``gym.spaces.Box`` observation space.
        ac_space: action space; only ``gym.spaces.Box`` is supported.
        hid_size: width of every hidden layer.
        num_hid_layers: number of hidden layers per sub-network.
        gaussian_fixed_var: if True, use a state-independent per-option
            ``logstd`` variable; otherwise the network outputs mean and logstd.
        num_options: number of options handled by the ``dense3D2`` heads.
        dc: stored unchanged on ``self.dc``.

    Raises:
        NotImplementedError: for a non-Box action space (previously this
            surfaced as a NameError when ``_act`` was built).
    """
    assert isinstance(ob_space, gym.spaces.Box)

    # Dimensions and bookkeeping.
    self.ac_space_dim = ac_space.shape[0]
    self.ob_space_dim = ob_space.shape[0]
    self.dc = dc
    self.last_action = tf.zeros(ac_space.shape, dtype=tf.float32)
    self.last_action_init = tf.zeros(ac_space.shape, dtype=tf.float32)
    self.num_options = num_options
    self.pdtype = pdtype = make_pdtype(ac_space)
    sequence_length = None

    ob = U.get_placeholder(name="ob", dtype=tf.float32, shape=[sequence_length] + list(ob_space.shape))
    option = U.get_placeholder(name="option", dtype=tf.int32, shape=[None])

    # Separate running statistics for the "pure" observation, i.e. the
    # observation without its trailing ac_space_dim entries (u[k-1]).
    obs_shape_pure = ((self.ob_space_dim - self.ac_space_dim), )
    with tf.variable_scope("obfilter"):
        self.ob_rms = RunningMeanStd(shape=ob_space.shape)
    with tf.variable_scope("obfilter_pure"):
        self.ob_rms_only = RunningMeanStd(shape=obs_shape_pure)

    obz = tf.clip_by_value((ob - self.ob_rms.mean) / self.ob_rms.std, -5.0, 5.0)
    obz_pure = tf.clip_by_value(
        (ob[:, :-self.ac_space_dim] - self.ob_rms_only.mean) / self.ob_rms_only.std, -5.0, 5.0)

    # --- value of (state, option): branch 0 uses the full observation,
    # branch 1 the pure one. Layer creation stays interleaved so the
    # variable order is unchanged.
    last_out0 = obz
    last_out1 = obz_pure
    for i in range(num_hid_layers):
        last_out0 = tf.nn.tanh(
            U.dense(last_out0, hid_size, "vffc0%i" % (i + 1), weight_init=U.normc_initializer(1.0)))
        last_out1 = tf.nn.tanh(
            U.dense(last_out1, hid_size, "vffc1%i" % (i + 1), weight_init=U.normc_initializer(1.0)))
    last_out0 = U.dense(last_out0, 1, "vfff0", weight_init=U.normc_initializer(1.0))
    last_out1 = U.dense(last_out1, 1, "vfff1", weight_init=U.normc_initializer(1.0))
    self.vpred = U.switch(option[0], last_out1, last_out0)[:, 0]

    # --- policy over options: one scalar score per option, then softmax.
    last_out0 = obz
    last_out1 = obz_pure
    for i in range(num_hid_layers):
        last_out0 = tf.nn.tanh(
            U.dense(last_out0, hid_size, "oppi0%i" % (i + 1), weight_init=U.normc_initializer(1.0)))
        last_out1 = tf.nn.tanh(
            U.dense(last_out1, hid_size, "oppi1%i" % (i + 1), weight_init=U.normc_initializer(1.0)))
    last_out0 = U.dense(last_out0, 1, "oppif0", weight_init=U.normc_initializer(1.0))
    last_out1 = U.dense(last_out1, 1, "oppif1", weight_init=U.normc_initializer(1.0))
    last_out = tf.concat([last_out0, last_out1], 1)
    self.op_pi = tf.nn.softmax(last_out)

    # --- termination head (no gradient into the option scores).
    self.tpred = tf.nn.sigmoid(
        dense3D2(tf.stop_gradient(last_out), 1, "termhead", option, num_options=num_options,
                 weight_init=U.normc_initializer(1.0)))[:, 0]
    # Options always terminate; the learned tpred is not sampled.
    termination_sample = tf.constant([True])

    # --- intra-option (control) policy on the pure observation.
    last_out = obz_pure
    for i in range(num_hid_layers):
        last_out = tf.nn.tanh(
            U.dense(last_out, hid_size, "polfc%i" % (i + 1), weight_init=U.normc_initializer(1.0)))

    if gaussian_fixed_var and isinstance(ac_space, gym.spaces.Box):
        mean = dense3D2(last_out, pdtype.param_shape()[0] // 2, "polfinal", option,
                        num_options=num_options, weight_init=U.normc_initializer(0.01), bias=False)
        mean = tf.nn.tanh(mean)
        logstd = tf.get_variable(
            name="logstd", shape=[num_options, 1, pdtype.param_shape()[0] // 2],
            initializer=tf.zeros_initializer())
        pdparam = U.concatenate([mean, mean * 0.0 + logstd[option[0]]], axis=1)
    else:
        pdparam = U.dense(last_out, pdtype.param_shape()[0], "polfinal", U.normc_initializer(0.01))
        if not isinstance(ac_space, gym.spaces.Box):
            raise NotImplementedError(
                "the option policy needs a Gaussian mean/logstd and therefore a Box action space")
        # State-dependent variance: take logstd from the flat parameters
        # (same [mean, logstd] layout as the branch above) so that the
        # `_act` output below is defined on this path as well.
        _, logstd = tf.split(value=pdparam, num_or_size_splits=2, axis=1)

    self.pd = pdtype.pdfromflat(pdparam)

    self.state_in = []
    self.state_out = []

    # stochastic=True samples around the mean (action-level exploration).
    stochastic = tf.placeholder(dtype=tf.bool, shape=())
    ac = U.switch(stochastic, self.pd.sample(), self.pd.mode())
    # Either the policy action or the previous action carried in the observation.
    ac = U.switch(option[0], ac, tf.stop_gradient(ob[:, -self.ac_space_dim:]))
    ac = tf.clip_by_value(ac, -1.0, 1.0)

    self.last_action = tf.stop_gradient(ac)
    self._act = U.function([stochastic, ob, option], [ac, self.vpred, last_out, logstd])
    self._get_v = U.function([ob, option], [self.vpred])
    self.get_term = U.function([ob, option], [termination_sample])
    self.get_tpred = U.function([ob, option], [self.tpred])
    self.get_vpred = U.function([ob, option], [self.vpred])
    self._get_op = U.function([ob], [self.op_pi])
def _init(self, ob_space, ac_space, hid_size, num_hid_layers, gaussian_fixed_var=True, gmm_comp=1,
          mirror_loss=False, observation_permutation=[], action_permutation=[]):
    """Build an MLP actor-critic graph with optional GMM head and mirror loss.

    The critic (``self.vpred``) and the actor are separate tanh MLPs fed by
    the raw observation placeholder. With ``mirror_loss`` the actor weights
    are re-applied to a mirrored observation and the resulting mean, mapped
    back through the action permutation, is exposed as ``self.mirrored_mean``.

    Args:
        ob_space: ``gym.spaces.Box`` observation space.
        ac_space: action space passed to ``make_pdtype``.
        hid_size: width of every hidden layer.
        num_hid_layers: number of hidden layers in actor and critic.
        gaussian_fixed_var: use a state-independent ``logstd`` variable.
        gmm_comp: number of mixture components; 1 means a single Gaussian.
        mirror_loss: build the mirrored forward pass.
        observation_permutation: signed indices; entry ``i`` gives the source
            index ``int(abs(p))`` and the sign written at row ``i`` of the
            observation permutation matrix. Only read, never mutated.
        action_permutation: same encoding for actions. Only read.
    """
    assert isinstance(ob_space, gym.spaces.Box)
    if mirror_loss:
        assert gaussian_fixed_var  # assume fixed std for now
        # The mirrored pass below treats params[-1] as the mean layer, which
        # is only recorded for the single-Gaussian Box head.
        assert gmm_comp == 1 and isinstance(ac_space, gym.spaces.Box), \
            "mirror_loss requires gmm_comp == 1 and a Box action space"

    self.pdtype = pdtype = make_pdtype(ac_space, gmm_comp)
    sequence_length = None

    def signed_permutation_matrix(permutation):
        """Return the float32 matrix with +-1 at (i, int(|permutation[i]|))."""
        size = len(permutation)
        matrix = np.zeros((size, size), dtype=np.float32)
        for row, perm in enumerate(permutation):
            # copysign instead of sign: np.sign(0) is 0, which would wipe the
            # row that maps to index 0 instead of writing +1 there.
            matrix[row, int(np.abs(perm))] = np.copysign(1.0, perm)
        return matrix

    self.mirror_loss = mirror_loss
    if mirror_loss:
        obs_perm_mat = signed_permutation_matrix(observation_permutation)
        act_perm_mat = signed_permutation_matrix(action_permutation)

    ob = U.get_placeholder(name="ob", dtype=tf.float32, shape=[sequence_length] + list(ob_space.shape))

    # --- critic
    last_out = ob
    for i in range(num_hid_layers):
        last_out = tf.nn.tanh(
            U.dense(last_out, hid_size, "vffc%i" % (i + 1), weight_init=U.normc_initializer(1.0)))
    self.vpred = U.dense(last_out, 1, "vffinal", weight_init=U.normc_initializer(1.0))[:, 0]

    # --- actor; keep each layer's (weights, bias) for the mirrored pass
    last_out = ob
    params = []
    for i in range(num_hid_layers):
        rt, pw, pb = U.dense_wparams(last_out, hid_size, "polfc%i" % (i + 1),
                                     weight_init=U.normc_initializer(1.0))
        last_out = tf.nn.tanh(rt)
        params.append([pw, pb])

    if gaussian_fixed_var and isinstance(ac_space, gym.spaces.Box):
        if gmm_comp == 1:
            mean, pw, pb = U.dense_wparams(last_out, pdtype.param_shape()[0] // 2, "polfinal",
                                           U.normc_initializer(0.01))
            params.append([pw, pb])
            self.mean = mean
            logstd = tf.get_variable(name="logstd", shape=[1, pdtype.param_shape()[0] // 2],
                                     initializer=tf.zeros_initializer())
            pdparam = U.concatenate([mean, mean * 0.0 + logstd], axis=1)
        else:
            # Flat layout: [means, logstds, mixture weights]; logstd starts at -1.
            means = U.dense(last_out, (pdtype.param_shape()[0] - gmm_comp) // 2, "polfinal",
                            U.normc_initializer(0.01))
            logstd = tf.get_variable(
                name="logstd",
                initializer=tf.constant(
                    np.ones((1, (pdtype.param_shape()[0] - gmm_comp) // 2), dtype=np.float32) * (-1.0)))
            weights = tf.nn.softmax(U.dense(last_out, gmm_comp, "gmmweights", U.normc_initializer(0.01)))
            pdparam = U.concatenate([means, means * 0.0 + logstd, weights], axis=1)
    elif gmm_comp == 1:
        pdparam = U.dense(last_out, pdtype.param_shape()[0], "polfinal", U.normc_initializer(0.01))
    else:
        meanstd = U.dense(last_out, pdtype.param_shape()[0] - gmm_comp, "polfinal",
                          U.normc_initializer(0.01))
        weights = tf.nn.softmax(U.dense(last_out, gmm_comp, "gmmweights", U.normc_initializer(0.01)))
        pdparam = U.concatenate([meanstd, weights], axis=1)

    if mirror_loss:
        # Re-run the actor (shared weights) on the mirrored observation and
        # map the resulting mean back through the action permutation.
        last_val = tf.matmul(ob, obs_perm_mat)
        for weight, bias in params[:-1]:
            last_val = tf.nn.tanh(tf.matmul(last_val, weight) + bias)
        mean_mir_obs = tf.matmul(last_val, params[-1][0]) + params[-1][1]
        self.mirrored_mean = tf.matmul(mean_mir_obs, act_perm_mat)

    if gmm_comp == 1:
        self.pd = pdtype.pdfromflat(pdparam)
    else:
        self.pd = pdtype.pdfromflat([pdparam, gmm_comp])

    self.state_in = []
    self.state_out = []

    stochastic = tf.placeholder(dtype=tf.bool, shape=())
    ac = U.switch(stochastic, self.pd.sample(), self.pd.mode())
    self._act = U.function([stochastic, ob], [ac, self.vpred])
def _init(self, ob_space, ac_space, hid_size, num_hid_layers, gaussian_fixed_var=True, bound_by_sigmoid=False,
          sigmoid_coef=1., activation='tanh', normalize_obs=True, actions='gaussian', avg_norm_symmetry=False,
          symmetric_interpretation=False, stdclip=5.0, gaussian_bias=False, gaussian_from_binary=False,
          parallel_value=False, pv_layers=2, pv_hid_size=512, three=False):
    """Build an MLP actor-critic graph supporting several action distributions.

    One actor/critic pair is built, or three when ``three`` is True; in that
    case the pair used for each sample is selected from the step counter
    placeholder ``st`` and from fixed observation indices.

    Args:
        ob_space: ``gym.spaces.Box`` observation space.
        ac_space: action space (``low``/``high`` are read for non-Gaussian types).
        hid_size: width of the hidden layers.
        num_hid_layers: number of hidden layers in actor and critic.
        gaussian_fixed_var: state-independent ``logstd`` for Gaussian actions.
        bound_by_sigmoid: squash the fixed-variance Gaussian mean with a sigmoid.
        sigmoid_coef: multiplier applied to the mean before that sigmoid.
        activation: ``'tanh'``, ``'relu'`` or a callable used for actor layers.
        normalize_obs: normalise observations with running mean/std.
        actions: one of ``'gaussian'``, ``'beta'``, ``'bernoulli'``,
            ``'binary'``, ``'cat_3'``, ``'cat_5'``.
        avg_norm_symmetry: average the statistics with those gathered at
            ``ORIG_SYMMETRIC_IDS`` (original warning: only valid for the
            41-number observation).
        symmetric_interpretation: unused in this function.
        stdclip: clipping range of the normalised observation.
        gaussian_bias: add 0.5 to the Gaussian mean.
        gaussian_from_binary: unused in this function.
        parallel_value: add a second value network to each value prediction.
        pv_layers: number of hidden layers of that second value network.
        pv_hid_size: hidden width of that second value network.
        three: build three sub-networks instead of one.

    Raises:
        AssertionError: for an unknown ``actions`` value.
        NotImplementedError: ``bound_by_sigmoid`` with Bernoulli/binary actions.
    """
    assert isinstance(ob_space, gym.spaces.Box)

    if actions == 'binary':
        pdtype = MultiCategoricalPdType(
            low=np.zeros_like(ac_space.low, dtype=np.int32),
            high=np.ones_like(ac_space.high, dtype=np.int32))
    elif actions == 'beta':
        pdtype = BetaPdType(
            low=np.zeros_like(ac_space.low, dtype=np.int32),
            high=np.ones_like(ac_space.high, dtype=np.int32))
    elif actions == 'bernoulli':
        pdtype = BernoulliPdType(ac_space.low.size)
    elif actions == 'gaussian':
        pdtype = make_pdtype(ac_space)
    elif actions == 'cat_3':
        pdtype = MultiCategoricalPdType(
            low=np.zeros_like(ac_space.low, dtype=np.int32),
            high=np.ones_like(ac_space.high, dtype=np.int32) * 2)
    elif actions == 'cat_5':
        pdtype = MultiCategoricalPdType(
            low=np.zeros_like(ac_space.low, dtype=np.int32),
            high=np.ones_like(ac_space.high, dtype=np.int32) * 4)
    else:
        # Explicit raise so the check survives `python -O`.
        raise AssertionError("unknown actions type: %r" % (actions,))
    self.pdtype = pdtype

    sequence_length = None

    self.ob = U.get_placeholder(name="ob", dtype=tf.float32, shape=[sequence_length] + list(ob_space.shape))
    self.st = U.get_placeholder(name="st", dtype=tf.int32, shape=[None])

    if normalize_obs:
        with tf.variable_scope("obfilter"):
            self.ob_rms = RunningMeanStd(shape=ob_space.shape)
        if avg_norm_symmetry:
            # Warning works only for normal observations (41 numbers)
            ob_mean = (tf.gather(self.ob_rms.mean, ORIG_SYMMETRIC_IDS) + self.ob_rms.mean) / 2
            ob_std = (tf.gather(self.ob_rms.std, ORIG_SYMMETRIC_IDS) + self.ob_rms.std) / 2  # Pretty crude
        else:
            ob_mean = self.ob_rms.mean
            ob_std = self.ob_rms.std
        obz = tf.clip_by_value((self.ob - ob_mean) / ob_std, -stdclip, stdclip)
    else:
        obz = self.ob
        # `_act` always returns the normalisation statistics; without a
        # filter report the identity transform instead of failing with a
        # NameError when the function is built.
        ob_mean = tf.zeros(ob_space.shape, dtype=tf.float32)
        ob_std = tf.ones(ob_space.shape, dtype=tf.float32)

    # Resolve the activation once; a callable is used as given.
    if activation == 'tanh':
        activation = tf.nn.tanh
    elif activation == 'relu':
        activation = tf.nn.relu

    vpreds = []
    pparams = []

    for part in range(1 if not three else 3):
        part_prefix = "" if part == 0 else "part_" + str(part)

        # --- critic
        last_out = obz
        for i in range(num_hid_layers):
            last_out = tf.nn.tanh(
                U.dense(last_out, hid_size, part_prefix + "vffc%i" % (i + 1),
                        weight_init=U.normc_initializer(1.0)))
        vpreds.append(
            U.dense(last_out, 1, part_prefix + "vffinal", weight_init=U.normc_initializer(1.0)))
        vpreds[-1] = vpreds[-1][:, 0]

        if parallel_value:
            # Second value network whose output is added to the first.
            last_out_2 = obz
            for i in range(pv_layers):
                last_out_2 = tf.nn.tanh(
                    U.dense(last_out_2, pv_hid_size, part_prefix + "pv_vffc%i" % (i + 1),
                            weight_init=U.normc_initializer(1.0)))
            last_out_2 = U.dense(last_out_2, 1, part_prefix + "pv_vffinal",
                                 weight_init=U.normc_initializer(1.0))
            vpreds[-1] += last_out_2[:, 0]

        # --- actor
        last_out = obz
        for i in range(num_hid_layers):
            dense = U.dense(last_out, hid_size, part_prefix + "polfc%i" % (i + 1),
                            weight_init=U.normc_initializer(1.0))
            last_out = activation(dense)

        if actions == 'gaussian':
            if gaussian_fixed_var:
                mean = U.dense(last_out, pdtype.param_shape()[0] // 2, part_prefix + "polfinal",
                               U.normc_initializer(0.01))
                if bound_by_sigmoid:
                    mean = tf.nn.sigmoid(mean * sigmoid_coef)
                logstd = tf.get_variable(
                    name=part_prefix + "logstd", shape=[1, pdtype.param_shape()[0] // 2],
                    initializer=tf.zeros_initializer())
                # Broadcast the shared logstd to the batch shape of the mean.
                logstd = mean * 0.0 + logstd
            else:
                mean = U.dense(last_out, pdtype.param_shape()[0] // 2, part_prefix + "polfinal",
                               U.normc_initializer(0.01))
                logstd = U.dense(last_out, pdtype.param_shape()[0] // 2, part_prefix + "polfinal_2",
                                 U.normc_initializer(0.01))
            # NOTE(review): nesting reconstructed from a flattened source; the
            # bias is assumed to apply to both variance modes -- confirm.
            if gaussian_bias:
                mean = mean + 0.5
            pdparam = U.concatenate([mean, logstd], axis=1)
        elif actions == 'beta':
            pdparam = U.dense(last_out, pdtype.param_shape()[0], part_prefix + "beta_lastlayer",
                              U.normc_initializer(0.01))
            pdparam = tf.nn.softplus(pdparam)
        elif actions in ['bernoulli', 'binary']:
            if bound_by_sigmoid:
                raise NotImplementedError("bound by sigmoid not implemented here")
            pdparam = U.dense(last_out, pdtype.param_shape()[0], part_prefix + "polfinal",
                              U.normc_initializer(0.01))
        elif actions in ['cat_3']:
            pdparam = U.dense(last_out, pdtype.param_shape()[0], part_prefix + "cat3_lastlayer",
                              U.normc_initializer(0.01))
        elif actions in ['cat_5']:
            pdparam = U.dense(last_out, pdtype.param_shape()[0], part_prefix + "cat5_lastlayer",
                              U.normc_initializer(0.01))
        else:
            raise AssertionError("unknown actions type: %r" % (actions,))

        pparams.append(pdparam)

    pparams = tf.stack(pparams)
    vpreds = tf.stack(vpreds)
    pparams = tf.transpose(pparams, perm=(1, 0, 2))  # [batchsize, networks, values]
    vpreds = tf.transpose(vpreds, perm=(1, 0))  # [batchsize, networks]

    self.stochastic = tf.placeholder(name="stochastic", dtype=tf.bool, shape=())

    if three:
        batchsize = tf.shape(pdparam)[0]
        # Hard-coded observation layout (presumably an obstacle flag and
        # obstacle distances -- verify against the environment).
        NO_OBSTACLES_ID = 5
        OBST_DIST = [278, 279, 280, 281, 282, 283, 284, 285]  # TODO: Alternative approach
        distances = [self.ob[:, i] for i in OBST_DIST]
        distances = tf.stack(distances, axis=1)
        no_obstacles = tf.cast(tf.equal(self.ob[:, NO_OBSTACLES_ID], 1.0), tf.int32)
        distances = tf.cast(tf.reduce_all(tf.equal(distances, 3), axis=1), tf.int32)
        no_obstacles_ahead = distances * no_obstacles  # 0 if obstacles, 1 if no obstacles
        begin = tf.cast(tf.less(self.st, 75), tf.int32)
        # begin==1 => 0, begin==0 => 1 + no_obstacles_ahead
        take_id = (1 - begin) * (1 + no_obstacles_ahead)

        take_id = tf.stack((tf.range(batchsize), take_id), axis=1)
        pdparam = tf.gather_nd(pparams, take_id)
        self.vpred = tf.gather_nd(vpreds, take_id)
    else:
        self.vpred = vpreds[:, 0]
        pdparam = pparams[:, 0]

    self.pd = pdtype.pdfromflat(pdparam)

    # Deterministic action: real_mean() where the distribution offers it,
    # otherwise the mode.
    if hasattr(self.pd, 'real_mean'):
        real_mean = self.pd.real_mean()
        ac = U.switch(self.stochastic, self.pd.sample(), real_mean)
    else:
        ac = U.switch(self.stochastic, self.pd.sample(), self.pd.mode())

    self._act = U.function([self.stochastic, self.ob, self.st], [ac, self.vpred, ob_mean, ob_std])

    if actions == 'binary':
        self._binary_f = U.function([self.stochastic, self.ob, self.st], [ac, self.pd.flat, self.vpred])
def _init(self, ob_space, ac_space, hid_size, num_hid_layers, gaussian_fixed_var=True):
    """Build an MLP actor-critic graph for a binned (discretised) action space.

    Each of the ``act_dim`` action dimensions has ``bins`` bins. The actor
    emits one sigmoid per (dimension, bin); the logit of a bin is a sum of
    ``log(s)`` and ``log(1 - s)`` terms over all bins of that dimension,
    chosen by the mask returned from ``construct_mask(bins)``.

    Args:
        ob_space: ``gym.spaces.Box`` observation space.
        ac_space: action space whose ``low``/``high`` give the bin range.
        hid_size: width of the hidden layers.
        num_hid_layers: number of hidden layers in actor and critic.
        gaussian_fixed_var: unsupported together with a Box action space.

    Raises:
        NotImplementedError: if ``gaussian_fixed_var`` is set and ``ac_space``
            is a ``gym.spaces.Box``.
    """
    assert isinstance(ob_space, gym.spaces.Box)

    bins = ac_space.high[0] - ac_space.low[0] + 1
    print('making policy bins size {}'.format(bins))
    act_dim = len(ac_space.high)

    self.pdtype = pdtype = make_pdtype(ac_space)
    sequence_length = None

    ob = U.get_placeholder(name="ob", dtype=tf.float32, shape=[sequence_length] + list(ob_space.shape))

    with tf.variable_scope("obfilter"):
        self.ob_rms = RunningMeanStd(shape=ob_space.shape)

    obz = tf.clip_by_value((ob - self.ob_rms.mean) / self.ob_rms.std, -5.0, 5.0)

    # --- critic
    last_out = obz
    for i in range(num_hid_layers):
        last_out = tf.nn.tanh(
            U.dense(last_out, hid_size, "vffc%i" % (i + 1), weight_init=U.normc_initializer(1.0)))
    self.vpred = U.dense(last_out, 1, "vffinal", weight_init=U.normc_initializer(1.0))[:, 0]

    # --- actor
    last_out = obz
    for i in range(num_hid_layers):
        last_out = tf.nn.tanh(
            U.dense(last_out, hid_size, "polfc%i" % (i + 1), weight_init=U.normc_initializer(1.0)))

    # The continuous head is not available in this policy (the statements
    # that used to follow this raise were unreachable and have been removed).
    if gaussian_fixed_var and isinstance(ac_space, gym.spaces.Box):
        raise NotImplementedError

    m = U.dense(last_out, pdtype.param_shape()[0], "polfinal", U.normc_initializer(0.01))
    # [batchsize, num-actions*bins]; with the small initial weights the
    # sigmoids start out near 0.5.
    norm_softm = tf.nn.sigmoid(m)
    # [batchsize, num-actions, bins]
    norm_softm = tf.reshape(norm_softm, [-1, act_dim, bins])
    # Repeat along a new trailing axis: [batchsize, num-actions, bins, bins]
    norm_softm_tiled = tf.tile(tf.expand_dims(norm_softm, axis=-1), [1, 1, 1, bins])

    # Mask selecting, per entry, the log(s) term (mask 1) or the
    # log(1 - s) term (mask 0).
    am_numpy = construct_mask(bins)
    am_tf = tf.constant(am_numpy, dtype=tf.float32)

    # 1e-8 keeps both logarithms finite when a sigmoid saturates.
    pdparam = tf.reduce_sum(
        tf.math.log(norm_softm_tiled + 1e-8) * am_tf
        + tf.math.log(1 - norm_softm_tiled + 1e-8) * (1 - am_tf),
        axis=-1)
    pdparam = tf.reshape(pdparam, [-1, act_dim * bins])

    self.pd = pdtype.pdfromflat(pdparam)

    self.state_in = []
    self.state_out = []

    stochastic = tf.placeholder(dtype=tf.bool, shape=())
    ac = U.switch(stochastic, self.pd.sample(), self.pd.mode())
    self._act = U.function([stochastic, ob], [ac, self.vpred])
def _init(self, ob_space, ac_space, hid_size, num_hid_layers, gaussian_fixed_var=True, num_options=2, dc=0):
    """Build the TF graph of a two-option actor-critic (ReLU hidden layers).

    Creates per-option value heads (``self.vpred``), a softmax policy over
    options (``self.op_pi``), a termination head (``self.tpred``) and a
    Gaussian intra-option policy whose mean is hard-clipped to [-1, 1].
    Depending on ``option[0]`` the emitted action is either the policy
    output or the trailing ``ac_space_dim`` entries of the observation
    (the original comments describe this as a zero-order hold for option 0).

    Args:
        ob_space: ``gym.spaces.Box`` observation space.
        ac_space: action space; only ``gym.spaces.Box`` is supported.
        hid_size: width of every hidden layer.
        num_hid_layers: number of hidden layers per sub-network.
        gaussian_fixed_var: if True, use a state-independent per-option
            ``logstd`` variable; otherwise the network outputs mean and logstd.
        num_options: number of options handled by the ``dense3D2`` heads.
        dc: stored unchanged on ``self.dc``.

    Raises:
        NotImplementedError: for a non-Box action space (previously this
            surfaced as a NameError when ``_act`` was built).
    """
    assert isinstance(ob_space, gym.spaces.Box)

    # Action / observation dimensions and bookkeeping.
    self.ac_space_dim = ac_space.shape[0]
    self.ob_space_dim = ob_space.shape[0]
    self.dc = dc
    self.last_action = tf.zeros(ac_space.shape, dtype=tf.float32)
    self.last_action_init = tf.zeros(ac_space.shape, dtype=tf.float32)
    self.num_options = num_options
    self.pdtype = pdtype = make_pdtype(ac_space)
    sequence_length = None

    ob = U.get_placeholder(name="ob", dtype=tf.float32, shape=[sequence_length] + list(ob_space.shape))
    option = U.get_placeholder(name="option", dtype=tf.int32, shape=[None])

    # Separate running statistics for the "pure" observation, i.e. the
    # observation without its trailing ac_space_dim entries (u[k-1]).
    obs_shape_pure = ((self.ob_space_dim - self.ac_space_dim), )
    with tf.variable_scope("obfilter"):
        self.ob_rms = RunningMeanStd(shape=ob_space.shape)
    with tf.variable_scope("obfilter_pure"):
        self.ob_rms_only = RunningMeanStd(shape=obs_shape_pure)

    obz = tf.clip_by_value((ob - self.ob_rms.mean) / self.ob_rms.std, -5.0, 5.0)
    obz_pure = tf.clip_by_value(
        (ob[:, :-self.ac_space_dim] - self.ob_rms_only.mean) / self.ob_rms_only.std, -5.0, 5.0)

    # --- Q-function approximation: branch 0 uses the full observation,
    # branch 1 the pure one. Layer creation stays interleaved so the
    # variable order is unchanged.
    last_out0 = obz
    last_out1 = obz_pure
    for i in range(num_hid_layers):
        last_out0 = tf.nn.relu(
            U.dense(last_out0, hid_size, "vffc0%i" % (i + 1), weight_init=U.normc_initializer(1.0)))
        last_out1 = tf.nn.relu(
            U.dense(last_out1, hid_size, "vffc1%i" % (i + 1), weight_init=U.normc_initializer(1.0)))
    last_out0 = U.dense(last_out0, 1, "vfff0", weight_init=U.normc_initializer(1.0))
    last_out1 = U.dense(last_out1, 1, "vfff1", weight_init=U.normc_initializer(1.0))
    self.vpred = U.switch(option[0], last_out1, last_out0)[:, 0]

    # --- policy over options: one scalar score per option, then softmax.
    last_out0 = obz
    last_out1 = obz_pure
    for i in range(num_hid_layers):
        last_out0 = tf.nn.relu(
            U.dense(last_out0, hid_size, "oppi0%i" % (i + 1), weight_init=U.normc_initializer(1.0)))
        last_out1 = tf.nn.relu(
            U.dense(last_out1, hid_size, "oppi1%i" % (i + 1), weight_init=U.normc_initializer(1.0)))
    last_out0 = U.dense(last_out0, 1, "oppif0", weight_init=U.normc_initializer(1.0))
    last_out1 = U.dense(last_out1, 1, "oppif1", weight_init=U.normc_initializer(1.0))
    last_out = tf.concat([last_out0, last_out1], 1)
    self.op_pi = tf.nn.softmax(last_out)

    # --- termination head (no gradient into the option scores).
    self.tpred = tf.nn.sigmoid(
        dense3D2(tf.stop_gradient(last_out), 1, "termhead", option, num_options=num_options,
                 weight_init=U.normc_initializer(1.0)))[:, 0]
    # Options always terminate; the learned tpred is not sampled.
    termination_sample = tf.constant([True])

    # --- control / intra-option policy on the pure observation.
    last_out = obz_pure
    for i in range(num_hid_layers):
        last_out = tf.nn.relu(
            U.dense(last_out, hid_size, "polfc%i" % (i + 1), weight_init=U.normc_initializer(1.0)))

    if gaussian_fixed_var and isinstance(ac_space, gym.spaces.Box):
        mean = dense3D2(last_out, pdtype.param_shape()[0] // 2, "polfinal", option,
                        num_options=num_options, weight_init=U.normc_initializer(0.01), bias=False)
        # Hard clip of the mean to [-1, 1] built from ReLUs (identity inside
        # the interval, saturating at -1 / +1 outside).
        mean = (-tf.nn.relu(-(mean - 1)) + tf.nn.relu(-(mean + 1))) + 1
        logstd = tf.get_variable(
            name="logstd", shape=[num_options, 1, pdtype.param_shape()[0] // 2],
            initializer=tf.zeros_initializer())
        pdparam = U.concatenate([mean, mean * 0.0 + logstd[option[0]]], axis=1)
    else:
        pdparam = U.dense(last_out, pdtype.param_shape()[0], "polfinal", U.normc_initializer(0.01))
        if not isinstance(ac_space, gym.spaces.Box):
            raise NotImplementedError(
                "the option policy needs a Gaussian mean/logstd and therefore a Box action space")
        # State-dependent variance: take logstd from the flat parameters
        # (same [mean, logstd] layout as the branch above) so that the
        # `_act` output below is defined on this path as well.
        _, logstd = tf.split(value=pdparam, num_or_size_splits=2, axis=1)

    self.pd = pdtype.pdfromflat(pdparam)

    self.state_in = []
    self.state_out = []

    # stochastic=True samples from the distribution (exploration).
    stochastic = tf.placeholder(dtype=tf.bool, shape=())
    ac = U.switch(stochastic, self.pd.sample(), self.pd.mode())
    # Either the policy action or the previous action carried in the observation.
    ac = U.switch(option[0], ac, tf.stop_gradient(ob[:, -self.ac_space_dim:]))
    ac = tf.clip_by_value(ac, -1.0, 1.0)

    self.last_action = tf.stop_gradient(ac)
    self._act = U.function([stochastic, ob, option], [ac, self.vpred, last_out, logstd])
    self._get_v = U.function([ob, option], [self.vpred])
    self.get_term = U.function([ob, option], [termination_sample])
    self.get_tpred = U.function([ob, option], [self.tpred])
    self.get_vpred = U.function([ob, option], [self.vpred])
    self._get_op = U.function([ob], [self.op_pi])
def _init(self, ob_space, ac_space, hid_size_V, hid_size_actor, num_hid_layers, V_keep_prob,
          mc_samples, layer_norm, activation_critic, activation_actor, dropout_on_V,
          gaussian_fixed_var=True, sample_dropout=False):
    """Build an actor-critic graph whose critic has Monte-Carlo dropout copies.

    Next to the plain value head ``self.vpred`` the critic weights are reused
    (``reuse=True``) by ``mc_samples`` replicas pushed through
    ``generate_dropout_layer``. Their mean (``self.vpred_mc_mean``), variance
    (``self.variance``) and ``mean + LAMBDA * sqrt(variance)``
    (``self.v_lambda_variance``) are exposed; the ``LAMBDA`` placeholder is
    available as ``self.lambda_ph``.

    Args:
        ob_space: ``gym.spaces.Box`` observation space.
        ac_space: action space passed to ``make_pdtype``.
        hid_size_V: hidden width of the critic.
        hid_size_actor: hidden width of the actor.
        num_hid_layers: number of hidden layers in actor and critic.
        V_keep_prob: keep probability handed to the dropout helpers.
        mc_samples: number of dropout replicas of the critic.
        layer_norm: apply layer normalisation in the critic's hidden layers.
        activation_critic: callable activation for the critic.
        activation_actor: callable activation for the actor.
        dropout_on_V: if True, ``_act`` reports dropout-based values.
        gaussian_fixed_var: state-independent ``logstd`` for Box actions.
        sample_dropout: with ``dropout_on_V``, make ``_act`` a list holding
            one function per dropout replica instead of a single function
            that returns the replica mean.
    """
    assert isinstance(ob_space, gym.spaces.Box)

    self.sample_dropout = sample_dropout
    self.pdtype = pdtype = make_pdtype(ac_space)
    sequence_length = None

    ob = U.get_placeholder(name="ob", dtype=tf.float32, shape=[sequence_length] + list(ob_space.shape))

    with tf.variable_scope("obfilter"):
        self.ob_rms = RunningMeanStd(shape=ob_space.shape)

    obz = tf.clip_by_value((ob - self.ob_rms.mean) / self.ob_rms.std, -5.0, 5.0)
    last_out = obz

    self.mc_samples = mc_samples
    self.V_keep_prob = V_keep_prob

    # --- value function
    # NOTE(review): the extent of this scope was reconstructed from a
    # flattened source; only the dense/layer_norm calls create variables.
    with tf.variable_scope("value_function"):
        # Replicas start from the same normalised observation; dropout on
        # the input itself is disabled.
        dropout_networks = [last_out] * self.mc_samples

        for i in range(num_hid_layers):
            layer_name = "vffc%i" % (i + 1)
            norm_scope = "vffc_activation%i" % (i + 1)
            if layer_norm:
                last_out = activation_critic(tc.layers.layer_norm(
                    tf.layers.dense(last_out, hid_size_V, name=layer_name,
                                    kernel_initializer=U.normc_initializer(1.0)),
                    center=True, scope=norm_scope, scale=True))

                # Names are bound as defaults so the closure does not depend
                # on the loop variable.
                def apply_layer(x, layer_name=layer_name, norm_scope=norm_scope):
                    return activation_critic(tc.layers.layer_norm(
                        tf.layers.dense(x, hid_size_V, name=layer_name, reuse=True),
                        center=True, scope=norm_scope, scale=True, reuse=True))
            else:
                last_out = activation_critic(
                    tf.layers.dense(last_out, hid_size_V, name=layer_name,
                                    kernel_initializer=U.normc_initializer(1.0)))

                def apply_layer(x, layer_name=layer_name):
                    return activation_critic(
                        tf.layers.dense(x, hid_size_V, name=layer_name, reuse=True))

            dropout_networks = generate_dropout_layer(apply_layer, dropout_networks, self.V_keep_prob)

        # Final layer: plain head plus one shared-weight head per replica.
        self.vpred = tf.layers.dense(last_out, 1, name="vffinal",
                                     kernel_initializer=U.normc_initializer(1.0))[:, 0]

        def apply_final(x):
            return tf.layers.dense(x, 1, activation=None, name="vffinal", reuse=True)[:, 0]

        # NOTE(review): the original calls `generate_layer` here, not
        # `generate_dropout_layer`; kept as is -- confirm this is intended.
        dropout_networks = generate_layer(apply_final, dropout_networks, self.V_keep_prob)

        _, variance = tf.nn.moments(tf.stack(dropout_networks), 0)

        self.vpred_mc_mean = tf.add_n(dropout_networks) / float(len(dropout_networks))
        self.vpred_dropout_networks = dropout_networks
        self.variance = variance

        LAMBDA = tf.placeholder(dtype=tf.float32, shape=())
        # Keep a handle on the placeholder so v_lambda_variance can be fed.
        self.lambda_ph = LAMBDA
        self.v_lambda_variance = self.vpred_mc_mean + LAMBDA * tf.sqrt(variance)

    # --- policy
    last_out = obz
    # NOTE(review): scope nesting reconstructed from a flattened source;
    # "polfinal"/"logstd" are assumed to live under "policy" -- confirm.
    with tf.variable_scope("policy"):
        for i in range(num_hid_layers):
            last_out = U.dense(last_out, hid_size_actor, "polfc%i" % (i + 1),
                               weight_init=U.normc_initializer(1.0))
            last_out = activation_actor(last_out)

        if gaussian_fixed_var and isinstance(ac_space, gym.spaces.Box):
            mean = U.dense(last_out, pdtype.param_shape()[0] // 2, "polfinal", U.normc_initializer(0.01))
            logstd = tf.get_variable(name="logstd", shape=[1, pdtype.param_shape()[0] // 2],
                                     initializer=tf.zeros_initializer())
            pdparam = U.concatenate([mean, mean * 0.0 + logstd], axis=1)
        else:
            pdparam = U.dense(last_out, pdtype.param_shape()[0], "polfinal", U.normc_initializer(0.01))

    self.pd = pdtype.pdfromflat(pdparam)

    self.state_in = []
    self.state_out = []

    stochastic = tf.placeholder(dtype=tf.bool, shape=())
    ac = U.switch(stochastic, self.pd.sample(), self.pd.mode())

    # A dropout-based action-value head Q(s, a) existed here only as
    # commented-out code and is not part of the graph.

    if dropout_on_V:
        if self.sample_dropout:
            # One act function per dropout replica.
            self._act = [U.function([stochastic, ob], [ac, x]) for x in dropout_networks]
        else:
            self._act = U.function([stochastic, ob], [ac, self.vpred_mc_mean])
    else:
        self._act = U.function([stochastic, ob], [ac, self.vpred])