def build_graph(self, obs_ph, reuse=False):
    """Build the network that maps observations to a single logit.

    The observations are standardized with running statistics, passed
    through two tanh layers of width ``self.hidden_size`` and projected
    to one linear output unit.

    Side effect: ``self.obs_rms`` is (re)assigned on every call.

    Args:
        obs_ph: observation input tensor.
        reuse: when true, the variables of ``self.scope`` are marked for
            reuse before any layer is built.

    Returns:
        The output tensor of the final 1-unit layer.
    """
    fully_connected = tf.contrib.layers.fully_connected

    with tf.variable_scope(self.scope):
        if reuse:
            tf.get_variable_scope().reuse_variables()

        # Only the running statistics live in the "obfilter" sub-scope.
        with tf.variable_scope("obfilter"):
            self.obs_rms = RunningMeanStd(shape=self.observation_shape)

        hidden = (obs_ph - self.obs_rms.mean) / self.obs_rms.std

        # Two identical hidden layers, created in the same order as before.
        for _ in range(2):
            hidden = fully_connected(hidden, self.hidden_size, activation_fn=tf.nn.tanh)

        return fully_connected(hidden, 1, activation_fn=tf.identity)
def build_graph(self, obs_ph, acs_ph, reuse=False):
    """Build the network that maps an (observation, action) pair to a logit.

    Observations are standardized with running statistics and concatenated
    with the actions along axis 1; the result goes through two tanh layers
    of width ``self.hidden_size`` and a final linear layer with one unit.

    Side effect: ``self.obs_rms`` is (re)assigned on every call.

    Args:
        obs_ph: observation input tensor.
        acs_ph: action input tensor, concatenated with the normalized
            observations along axis 1.
        reuse: when true, the variables of ``self.scope`` are marked for
            reuse before any layer is built.

    Returns:
        The output tensor of the final 1-unit layer.
    """
    fc = tf.contrib.layers.fully_connected

    with tf.variable_scope(self.scope):
        if reuse:
            tf.get_variable_scope().reuse_variables()

        with tf.variable_scope("obfilter"):
            self.obs_rms = RunningMeanStd(shape=self.observation_shape)

        normalized_obs = (obs_ph - self.obs_rms.mean) / self.obs_rms.std

        # Observation and action together form one transition.
        net = tf.concat([normalized_obs, acs_ph], axis=1)

        for _ in range(2):
            net = fc(net, self.hidden_size, activation_fn=tf.nn.tanh)

        logits = fc(net, 1, activation_fn=tf.identity)

    return logits
def _init(self, ob_space, ac_space, hid_size, num_hid_layers, gaussian_fixed_var=True):
    """Build the observation filter, MLP trunk, value head and action output.

    The observation is standardized with running statistics and clipped to
    [-5, 5]. It then passes through ``num_hid_layers`` tanh layers named
    "vffc1", "vffc2", ... Both the value prediction ("vffinal") and the
    action distribution (``pdtype.pdfromlatent``) are computed from the
    output of that same stack.

    A separate policy stack used to follow the value head but is disabled
    in the source: it built "polfc<i>" layers and a "polfinal" layer (with
    a state-independent "logstd" variable when ``gaussian_fixed_var`` was
    set and the action space was a Box) and fed the result to
    ``pdtype.pdfromflat``. Because that path is disabled,
    ``gaussian_fixed_var`` is not used by the live code.

    Args:
        ob_space: observation space; must be a ``gym.spaces.Box``.
        ac_space: action space handed to ``make_pdtype``.
        hid_size: number of units in every hidden layer.
        num_hid_layers: number of hidden layers.
        gaussian_fixed_var: currently unused (see above).

    Sets ``pdtype``, ``obs``, ``ob_rms``, ``v_preds``, ``pd``, ``pi``,
    ``ac`` and ``_act`` on ``self``.
    """
    assert isinstance(ob_space, gym.spaces.Box)

    pdtype = make_pdtype(ac_space)
    self.pdtype = pdtype

    # The leading dimension of the observation placeholder is unspecified.
    ob = U.get_placeholder(name="ob", dtype=tf.float32, shape=[None] + list(ob_space.shape))
    self.obs = ob

    with tf.variable_scope("obfilter"):
        self.ob_rms = RunningMeanStd(shape=ob_space.shape)

    hidden = tf.clip_by_value((ob - self.ob_rms.mean) / self.ob_rms.std, -5.0, 5.0)
    for layer_idx in range(1, num_hid_layers + 1):
        pre_activation = dense(
            hidden, hid_size, "vffc%i" % layer_idx, weight_init=U.normc_initializer(1.0)
        )
        hidden = tf.nn.tanh(pre_activation)

    # Value head: take column 0 of the single-unit output.
    value_out = dense(hidden, 1, "vffinal", weight_init=U.normc_initializer(1.0))
    self.v_preds = value_out[:, 0]

    # The action distribution is built from the same hidden features.
    self.pd, self.pi = pdtype.pdfromlatent(hidden)

    # (original author's note at this point: "change for BC")
    stochastic = U.get_placeholder(name="stochastic", dtype=tf.bool, shape=())

    # U.switch selects between a sample and the mode according to `stochastic`.
    self.ac = U.switch(stochastic, self.pd.sample(), self.pd.mode())
    self._act = U.function([stochastic, ob], [self.ac, self.v_preds])
def _normalize_clip_observation(x, clip_range=(-5.0, 5.0)):
    """Standardize ``x`` with running statistics and clip the result.

    Args:
        x: input tensor; ``x.shape[1:]`` (everything but the first
            dimension) is used as the shape of the running statistics.
        clip_range: iterable of bounds. Its minimum is used as the lower
            clip value and its maximum as the upper one, so the order of
            the elements does not matter. The default is an immutable
            tuple so that it cannot be altered between calls.

    Returns:
        A ``(norm_x, rms)`` pair: the normalized and clipped tensor, and
        the ``RunningMeanStd`` instance created for it.
    """
    rms = RunningMeanStd(shape=x.shape[1:])
    lower, upper = min(clip_range), max(clip_range)
    norm_x = tf.clip_by_value((x - rms.mean) / rms.std, lower, upper)
    return norm_x, rms
def __init__(self, env, hidden_size, expert_dataset):
    """Build the guidance network, its loss and its training ops.

    Network (all under the 'guidance' variable scope):
      * the agent state is standardized with running statistics
        (``self.obs_rms``, created in the "obfilter" sub-scope);
      * state and agent action each go through their own leaky-ReLU
        layer ('layer_s', 'layer_a'); the results are concatenated and
        passed through 'layer1';
      * 'layer2' is a linear layer with ``env.action_space.shape[0]``
        units.

    Loss: softmax cross-entropy between the 'layer2' output (as logits)
    and ``softmax(expert_a)`` (as labels), averaged with ``reduce_mean``.

    NOTE: the original author flagged the loss section with a "BUG"
    marker and left an alternative disabled:
    ``tf.contrib.gan.losses.wargs.mutual_information_penalty``.

    NOTE(review): the optimizer and the gradient function are built
    inside the 'guidance' scope here; confirm this matches the intended
    extent of the scope.

    Args:
        env: environment providing ``observation_space.shape`` and
            ``action_space.shape``.
        hidden_size: number of units in each hidden layer.
        expert_dataset: stored on ``self.expert_dataset``; not otherwise
            used here.
    """
    self.hidden_size = hidden_size
    self.expert_dataset = expert_dataset

    obs_shape = env.observation_space.shape
    act_shape = env.action_space.shape

    def leaky_dense(inputs, name):
        # Every hidden layer shares the same width and activation.
        return tf.layers.dense(inputs=inputs,
                               units=self.hidden_size,
                               activation=tf.nn.leaky_relu,
                               name=name)

    with tf.variable_scope('guidance'):
        self.scope = tf.get_variable_scope().name

        # Inputs: agent state, agent action and expert action.
        self.agent_s = tf.placeholder(dtype=tf.float32,
                                      shape=[None] + list(obs_shape),
                                      name='ph_agent_s')
        self.agent_a = tf.placeholder(dtype=tf.float32,
                                      shape=[None] + list(act_shape),
                                      name='ph_agent_a')
        self.expert_a = tf.placeholder(dtype=tf.float32,
                                       shape=[None] + list(act_shape),
                                       name='ph_expert_a')

        with tf.variable_scope("obfilter"):
            self.obs_rms = RunningMeanStd(shape=obs_shape)

        normalized_state = (self.agent_s - self.obs_rms.mean) / self.obs_rms.std

        state_features = leaky_dense(normalized_state, 'layer_s')
        action_features = leaky_dense(self.agent_a, 'layer_a')
        joint_features = tf.concat([state_features, action_features], axis=1)
        hidden = leaky_dense(joint_features, 'layer1')
        output = tf.layers.dense(inputs=hidden,
                                 units=act_shape[0],
                                 activation=tf.identity,
                                 name='layer2')

        # Expert actions are turned into a distribution to serve as labels.
        labels = tf.nn.softmax(self.expert_a)
        per_sample_loss = tf.nn.softmax_cross_entropy_with_logits(labels=labels,
                                                                  logits=output)
        self.loss = tf.reduce_mean(per_sample_loss)

        optimizer = tf.train.AdamOptimizer()
        self.train_op = optimizer.minimize(self.loss)

        self.loss_name = ["guidance_loss"]

        # Function returning the loss followed by its flattened gradient.
        var_list = self.get_trainable_variables()
        flat_gradient = U.flatgrad(self.loss, var_list)
        self.lossandgrad = U.function(
            [self.agent_s, self.agent_a, self.expert_a],
            [self.loss, flat_gradient])
def __init__(self, env, hidden_size, expert_dataset):
    """Build the guidance network for integer-coded actions.

    Network (all under the 'guidance' variable scope):
      * agent and expert actions are int32 placeholders that are one-hot
        encoded with depth ``env.action_space.n``;
      * the agent state is standardized with running statistics
        (``self.obs_rms``, created in the "obfilter" sub-scope);
      * state and one-hot agent action each go through their own
        leaky-ReLU layer ('layer_s', 'layer_a'); the results are
        concatenated and passed through 'layer1';
      * 'layer2' has ``env.action_space.n`` units with a softmax
        activation.

    Loss: ``tf.keras.losses.categorical_crossentropy`` between the one-hot
    expert action and the softmax output, averaged with ``reduce_mean``.

    NOTE: the original author flagged the loss section with a "BUG"
    marker and left two alternatives disabled:
    ``tf.nn.softmax_cross_entropy_with_logits`` and
    ``tf.contrib.gan.losses.wargs.mutual_information_penalty``.

    NOTE(review): the optimizer and the gradient function are built
    inside the 'guidance' scope here; confirm this matches the intended
    extent of the scope.

    Args:
        env: environment providing ``observation_space.shape`` and
            ``action_space.n``.
        hidden_size: number of units in each hidden layer.
        expert_dataset: object whose ``inputs`` and ``labels`` attributes
            are stored as ``self.obs`` and ``self.acs``.
    """
    self.obs = expert_dataset.inputs
    self.acs = expert_dataset.labels

    obs_shape = env.observation_space.shape
    n_actions = env.action_space.n

    def leaky_dense(inputs, name):
        # Every hidden layer shares the same width and activation.
        return tf.layers.dense(inputs=inputs,
                               units=hidden_size,
                               activation=tf.nn.leaky_relu,
                               name=name)

    with tf.variable_scope('guidance'):
        self.scope = tf.get_variable_scope().name

        self.agent_s = tf.placeholder(dtype=tf.float32,
                                      shape=[None] + list(obs_shape),
                                      name='ph_agent_s')

        # Actions arrive as integer indices and are expanded to one-hot.
        self.agent_a = tf.placeholder(dtype=tf.int32, shape=[None], name='ph_agent_a')
        agent_action_one_hot = tf.one_hot(self.agent_a, depth=n_actions)
        self.expert_a = tf.placeholder(dtype=tf.int32, shape=[None], name='ph_expert_a')
        expert_action_one_hot = tf.one_hot(self.expert_a, depth=n_actions)

        with tf.variable_scope("obfilter"):
            self.obs_rms = RunningMeanStd(shape=obs_shape)

        normalized_state = (self.agent_s - self.obs_rms.mean) / self.obs_rms.std

        state_features = leaky_dense(normalized_state, 'layer_s')
        action_features = leaky_dense(agent_action_one_hot, 'layer_a')
        joint_features = tf.concat([state_features, action_features], axis=1)
        hidden = leaky_dense(joint_features, 'layer1')
        action_probs = tf.layers.dense(inputs=hidden,
                                       units=n_actions,
                                       activation=tf.nn.softmax,
                                       name='layer2')

        per_sample_loss = tf.keras.losses.categorical_crossentropy(
            y_true=expert_action_one_hot, y_pred=action_probs)
        self.loss = tf.reduce_mean(per_sample_loss)

        optimizer = tf.train.AdamOptimizer()
        self.train_op = optimizer.minimize(self.loss)

        self.loss_name = ["guidance_loss"]

        # Function returning the loss followed by its flattened gradient.
        var_list = self.get_trainable_variables()
        flat_gradient = U.flatgrad(self.loss, var_list)
        self.lossandgrad = U.function(
            [self.agent_s, self.agent_a, self.expert_a],
            [self.loss, flat_gradient])