def make_duel_critics(self, obs=None, action=None, reuse=False, scope="duel_values_fn"):
    """
    Build the two "duel" Q-value heads for a state-action pair.

    The observation is passed through the CNN extractor (when
    ``self.feature_extraction == "cnn"``) or flattened, concatenated with the
    action, and fed to two separate heads living in the ``qf1`` and ``qf2``
    variable scopes. Each head ends in a dense layer with one output unit.

    :param obs: observation input; ``self.processed_obs`` is used when None
    :param action: action input, concatenated to the observation features (required)
    :param reuse: forwarded to every ``tf.variable_scope`` opened here
    :param scope: name of the outer variable scope
    :return: ``(self.qf1_duel, self.qf2_duel)``, which are also stored on ``self``
    :raises ValueError: if ``action`` is None
    """
    # ``action`` only defaults to None for signature symmetry with the other
    # builders; fail early with a clear message instead of deep inside TF.
    if action is None:
        raise ValueError("make_duel_critics requires an action input, got action=None")
    if obs is None:
        obs = self.processed_obs

    with tf.variable_scope(scope, reuse=reuse):
        if self.feature_extraction == "cnn":
            critics_h_duel = self.cnn_extractor(obs, **self.cnn_kwargs)
        else:
            critics_h_duel = tf.layers.flatten(obs)

        # Concatenate preprocessed state and action
        qf_h_duel = tf.concat([critics_h_duel, action], axis=-1)

        # Double Q values to reduce overestimation
        with tf.variable_scope('qf1', reuse=reuse):
            # NOTE(review): every sibling builder calls
            # mlp(x, self.layers, self.activ_fn, layer_norm=self.layer_norm);
            # here ``action`` is passed as the second argument. Left unchanged
            # because ``mlp`` is defined elsewhere -- confirm this is intended.
            qf1_h_duel = mlp(qf_h_duel, action)
            qf1_duel = tf.layers.dense(qf1_h_duel, 1, name="qf1")

        with tf.variable_scope('qf2', reuse=reuse):
            qf2_h_duel = mlp(qf_h_duel, action)
            # NOTE(review): this layer is named "qf1" inside the 'qf2' scope
            # (looks like a copy-paste). The enclosing scope keeps the variable
            # unique; it is not renamed here because that would change the
            # variable names of already-saved models.
            qf2_duel = tf.layers.dense(qf2_h_duel, 1, name="qf1")

        self.qf1_duel = qf1_duel
        self.qf2_duel = qf2_duel

    return self.qf1_duel, self.qf2_duel
def make_critics(self, obs=None, action=None, reuse=False, scope="values_fn"):
    """
    Build the twin Q-value heads for a state-action pair.

    The observation is passed through the CNN extractor (when
    ``self.feature_extraction == "cnn"``) or flattened, concatenated with the
    action, and fed to two independent MLP heads (``qf1`` and ``qf2``), each
    ending in a dense layer with one output unit.

    :param obs: observation input; ``self.processed_obs`` is used when None
    :param action: action input, concatenated to the observation features (required)
    :param reuse: forwarded to every ``tf.variable_scope`` opened here
    :param scope: name of the outer variable scope
    :return: ``(self.qf1, self.qf2)``, which are also stored on ``self``
    :raises ValueError: if ``action`` is None
    """
    # The default of None is only a placeholder: the critics cannot be built
    # without an action, so reject it before any graph op is created.
    if action is None:
        raise ValueError("make_critics requires an action input, got action=None")
    if obs is None:
        obs = self.processed_obs

    with tf.variable_scope(scope, reuse=reuse):
        if self.feature_extraction == "cnn":
            critics_h = self.cnn_extractor(obs, **self.cnn_kwargs)
        else:
            critics_h = tf.layers.flatten(obs)

        # Concatenate preprocessed state and action
        qf_h = tf.concat([critics_h, action], axis=-1)

        # Double Q values to reduce overestimation
        with tf.variable_scope('qf1', reuse=reuse):
            qf1_h = mlp(qf_h, self.layers, self.activ_fn, layer_norm=self.layer_norm)
            qf1 = tf.layers.dense(qf1_h, 1, name="qf1")

        with tf.variable_scope('qf2', reuse=reuse):
            qf2_h = mlp(qf_h, self.layers, self.activ_fn, layer_norm=self.layer_norm)
            qf2 = tf.layers.dense(qf2_h, 1, name="qf2")

        self.qf1 = qf1
        self.qf2 = qf2

    return self.qf1, self.qf2
def make_critics(self, obs=None, action=None, reuse=False, scope="values_fn", create_vf=True, create_qf=True):
    """
    Build the value head and/or the twin Q heads, each with a softmax output.

    The observation is passed through the CNN extractor (when
    ``self.feature_extraction == "cnn"``) or flattened. Depending on the flags,
    a ``vf`` head on the observation features and/or ``qf1``/``qf2`` heads on the
    observation features concatenated with the action are created. Every head
    ends in a dense layer with ``self.n_spt`` units and ``"softmax"`` activation.

    :param obs: observation input; ``self.processed_obs`` is used when None
    :param action: action input; only needed (and then required) when ``create_qf`` is true
    :param reuse: forwarded to every ``tf.variable_scope`` opened here
    :param scope: name of the outer variable scope
    :param create_vf: build the value head and store it in ``self.value_fn``
    :param create_qf: build both Q heads and store them in ``self.qf1`` / ``self.qf2``
    :return: ``(self.qf1, self.qf2, self.value_fn)``; an attribute whose head was
        not created by this call keeps whatever value ``self`` already held
        (presumably initialised elsewhere in the class -- not visible here)
    :raises ValueError: if ``create_qf`` is true and ``action`` is None
    """
    # The Q heads concatenate the action to the features, so it must be given
    # whenever they are requested; the value head alone does not need it.
    if create_qf and action is None:
        raise ValueError("make_critics requires an action input when create_qf is true, got action=None")
    if obs is None:
        obs = self.processed_obs

    with tf.variable_scope(scope, reuse=reuse):
        if self.feature_extraction == "cnn":
            critics_h = self.cnn_extractor(obs, **self.cnn_kwargs)
        else:
            critics_h = tf.layers.flatten(obs)

        if create_vf:
            # Value function
            with tf.variable_scope('vf', reuse=reuse):
                vf_h = mlp(critics_h, self.layers, self.activ_fn, layer_norm=self.layer_norm)
                value_fn = tf.layers.dense(vf_h, self.n_spt, name="vf", activation="softmax")
            self.value_fn = value_fn

        if create_qf:
            # Concatenate preprocessed state and action
            qf_h = tf.concat([critics_h, action], axis=-1)

            # Double Q values to reduce overestimation
            with tf.variable_scope('qf1', reuse=reuse):
                qf1_h = mlp(qf_h, self.layers, self.activ_fn, layer_norm=self.layer_norm)
                qf1 = tf.layers.dense(qf1_h, self.n_spt, name="qf1", activation="softmax")

            with tf.variable_scope('qf2', reuse=reuse):
                qf2_h = mlp(qf_h, self.layers, self.activ_fn, layer_norm=self.layer_norm)
                qf2 = tf.layers.dense(qf2_h, self.n_spt, name="qf2", activation="softmax")

            self.qf1 = qf1
            self.qf2 = qf2

    return self.qf1, self.qf2, self.value_fn
def setup_model(self):
    """
    Create the graph, session, target/predictor networks, loss and train op.

    A fresh ``tf.Graph`` and session are created and stored on ``self``. Two
    networks are built on the processed observation: ``target_network`` (its
    output is wrapped in ``tf.stop_gradient`` wherever the loss uses it) and
    ``predictor_network``. ``int_reward`` is the squared difference between
    them averaged over axis 1, ``aux_loss`` the same squared difference averaged
    over all elements, and ``training_op`` minimises ``aux_loss`` with Adam.
    Finally all global variables are initialised in the new session.

    :raises ValueError: if ``self.network_type`` is neither ``'cnn'`` nor ``'mlp'``
    """
    self.graph = tf.Graph()
    with self.graph.as_default():
        self.sess = tf_util.make_session(num_cpu=None, graph=self.graph)

        obs_ph, processed = observation_input(
            self.venv.observation_space,
            scale=(self.network_type == "cnn"),
        )
        self.observation_ph = obs_ph
        self.processed_obs = processed

        # Target network: only ever read through stop_gradient in the loss.
        with tf.variable_scope("target_model"):
            if self.network_type == 'cnn':
                target = small_convnet(processed, tf.nn.leaky_relu)
            elif self.network_type == 'mlp':
                target = tf_layers.mlp(processed, [1024, 512])
                target = tf_layers.linear(target, "out", 512)
            else:
                raise ValueError("Unknown network type {}!".format(self.network_type))
            self.target_network = target

        # Predictor network: same trunk choice, followed by two extra linear layers.
        with tf.variable_scope("predictor_model"):
            if self.network_type == 'cnn':
                predictor = tf.nn.relu(small_convnet(processed, tf.nn.leaky_relu))
            elif self.network_type == 'mlp':
                predictor = tf_layers.mlp(processed, [1024, 512])
            predictor = tf.nn.relu(tf_layers.linear(predictor, "pred_fc1", 512))
            predictor = tf_layers.linear(predictor, "out", 512)
            self.predictor_network = predictor

        def squared_error():
            # Built afresh on each call so both consumers get their own ops.
            return tf.square(tf.stop_gradient(target) - predictor)

        with tf.name_scope("loss"):
            self.int_reward = tf.reduce_mean(squared_error(), axis=1)
            self.aux_loss = tf.reduce_mean(squared_error())

        with tf.name_scope("train"):
            self.optimizer = tf.train.AdamOptimizer(self.learning_rate)
            self.training_op = self.optimizer.minimize(self.aux_loss)

        self.params = tf.trainable_variables()
        initializer = tf.global_variables_initializer()
        initializer.run(session=self.sess)
def make_quantile_function(self, obs=None, action=None, reuse=False, scope="quantile_fn", n_support=64):
    """
    Build the network that proposes quantile fractions for a state-action pair.

    The observation is passed through the CNN extractor (when
    ``self.feature_extraction == "cnn"``) or flattened, concatenated with the
    action and fed through an MLP and a dense layer with ``n_support`` units.
    A softmax over those units gives the probabilities ``qi``.

    :param obs: observation input; ``self.processed_obs`` is used when None
    :param action: action input, concatenated to the observation features (required)
    :param reuse: forwarded to ``tf.variable_scope``
    :param scope: name of the variable scope
    :param n_support: number of units of the final dense layer
    :return: ``(qi, tau, tau_hats, entropies)`` where ``qi`` is the softmax
        output, ``tau`` is the cumulative sum of ``qi`` along axis 1 with a zero
        column prepended, ``tau_hats`` holds the midpoints of consecutive ``tau``
        entries (gradient stopped) and ``entropies`` is ``-sum(qi * log qi)``
        over the last axis with ``keepdims=True``
    :raises ValueError: if ``action`` is None
    """
    if action is None:
        raise ValueError("make_quantile_function requires an action input, got action=None")
    if obs is None:
        obs = self.processed_obs

    with tf.variable_scope(scope, reuse=reuse):
        if self.feature_extraction == "cnn":
            critics_h = self.cnn_extractor(obs, **self.cnn_kwargs)
        else:
            critics_h = tf.layers.flatten(obs)

        # Concatenate preprocessed state and action
        qi_h = tf.concat([critics_h, action], axis=-1)
        qi_h = mlp(qi_h, self.layers, self.activ_fn, layer_norm=self.layer_norm)

        # Keep the pre-softmax logits so the log-probabilities can be computed
        # with log_softmax: tf.log(softmax(x)) yields -inf once a probability
        # underflows to 0, and -inf * 0 then turns the entropy into NaN.
        # The layer keeps the name "qi", so its variables are unchanged.
        logits = tf.layers.dense(qi_h, n_support, name="qi")
        qi = tf.nn.softmax(logits)
        logqi = tf.nn.log_softmax(logits)

        tau = tf.math.cumsum(qi, axis=1)
        # Prepend a zero column so consecutive entries bracket each fraction.
        tau = tf.concat([tf.zeros([tf.shape(tau)[0], 1]), tau], axis=-1)
        tau_hats = tf.stop_gradient((tau[:, :-1] + tau[:, 1:]) / 2.0)
        entropies = -tf.reduce_sum(logqi * qi, axis=-1, keepdims=True)

    return qi, tau, tau_hats, entropies
def make_actor(self, obs=None, reuse=False, scope="pi"):
    """
    Build the deterministic actor network.

    The observation is passed through the CNN extractor (when
    ``self.feature_extraction == "cnn"``) or flattened, then through an MLP and
    a final dense layer with ``self.ac_space.shape[0]`` units and tanh
    activation.

    :param obs: observation input; ``self.processed_obs`` is used when None
    :param reuse: forwarded to ``tf.variable_scope``; when falsy the output is
        additionally stored in ``self.policy``
    :param scope: name of the variable scope
    :return: the output of the final tanh dense layer
    """
    observations = self.processed_obs if obs is None else obs

    with tf.variable_scope(scope, reuse=reuse):
        if self.feature_extraction == "cnn":
            features = self.cnn_extractor(observations, **self.cnn_kwargs)
        else:
            features = tf.layers.flatten(observations)

        hidden = mlp(features, self.layers, self.activ_fn, layer_norm=self.layer_norm)
        policy = tf.layers.dense(hidden, self.ac_space.shape[0], activation=tf.tanh)

        # Only a non-reusing build is remembered on the instance.
        if not reuse:
            self.policy = policy

    return policy
def make_actor(self, obs=None, reuse=False, scope="pi"):
    """
    Build the stochastic (squashed Gaussian) actor network.

    When ``self.obs_module_indices`` is set, the observation is first reduced to
    the indices stored under its ``"pi"`` key. The features (CNN extractor or
    flatten) go through an MLP, then two dense layers without activation produce
    the mean and the log standard deviation. The log-std is clipped to
    ``[LOG_STD_MIN, LOG_STD_MAX]``, an action is sampled as
    ``mean + noise * std``, and ``apply_squashing_func`` produces the final
    outputs.

    Stored on ``self``: ``act_mu``, ``std``, ``entropy``, ``policy`` and
    ``deterministic_policy``.

    :param obs: observation input; ``self.processed_obs`` is used when None
    :param reuse: forwarded to ``tf.variable_scope``
    :param scope: name of the variable scope
    :return: ``(deterministic_policy, policy, logp_pi)`` as returned by
        ``apply_squashing_func``
    """
    observations = self.processed_obs if obs is None else obs
    if self.obs_module_indices is not None:
        observations = tf.gather(observations, self.obs_module_indices["pi"], axis=-1)

    with tf.variable_scope(scope, reuse=reuse):
        if self.feature_extraction == "cnn":
            features = self.cnn_extractor(
                observations, name="pi_c1", act_fun=self.activ_fn, **self.cnn_kwargs)
        else:
            features = tf.layers.flatten(observations)

        hidden = mlp(features, self.layers, self.activ_fn, layer_norm=self.layer_norm)

        # Mean head first, log-std head second: the layers are unnamed, so
        # their creation order decides the generated variable names.
        mean = tf.layers.dense(hidden, self.ac_space.shape[0], activation=None)
        self.act_mu = mean
        # Important difference with SAC and other algo such as PPO:
        # the std depends on the state, so we cannot use
        # stable_baselines.common.distribution
        raw_log_std = tf.layers.dense(hidden, self.ac_space.shape[0], activation=None)

    # Hard clipping of the log-std (the "original implementation" variant);
    # no regularization of the policy output is applied for now.
    log_std = tf.clip_by_value(raw_log_std, LOG_STD_MIN, LOG_STD_MAX)
    std = tf.exp(log_std)
    self.std = std

    # Reparameterization trick
    noise = tf.random_normal(tf.shape(mean))
    sample = mean + noise * std
    logp_sample = gaussian_likelihood(sample, mean, log_std)
    self.entropy = gaussian_entropy(log_std)

    # MISSING: reg params for log and mu
    # Apply squashing and account for it in the probability
    deterministic_policy, policy, logp_pi = apply_squashing_func(mean, sample, logp_sample)
    self.policy = policy
    self.deterministic_policy = deterministic_policy

    return deterministic_policy, policy, logp_pi
def make_critics(self, obs=None, action=None, reuse=False, scope="values_fn", model_type="QR", iqn_tau=None, n_support=64):
    """
    Build the twin distributional critics for a state-action pair.

    The observation is passed through the CNN extractor (when
    ``self.feature_extraction == "cnn"``) or flattened, then concatenated with
    the action. Two heads (``qf1`` and ``qf2``) are built on top:

    * ``"QR"``: an MLP followed by a dense layer with ``n_support`` units.
    * ``"IQN"`` / ``"FQF"``: the state-action features and a cosine embedding of
      ``iqn_tau`` are each projected to ``self.layers[0]`` units, multiplied
      together, passed through the remaining MLP layers (if any) and a dense
      layer with one unit, then reshaped to ``[-1, iqn_tau.shape[1]]``.

    :param obs: observation input; ``self.processed_obs`` is used when None
    :param action: action input, concatenated to the observation features (required)
    :param reuse: forwarded to every ``tf.variable_scope`` opened here
    :param scope: name of the outer variable scope
    :param model_type: one of ``"QR"``, ``"IQN"`` or ``"FQF"``
    :param iqn_tau: fractions fed to the cosine embedding; required for
        ``"IQN"`` and ``"FQF"``, unused for ``"QR"``
    :param n_support: number of output units per head for ``"QR"``
    :return: ``(self.qf1, self.qf2)``, which are also stored on ``self``
    :raises ValueError: if ``model_type`` is unknown, ``action`` is None, or
        ``iqn_tau`` is None for ``"IQN"`` / ``"FQF"``
    """
    # Validate before touching the graph: previously an unknown model type
    # printed a message and called exit() after part of the graph was built.
    if model_type not in ("QR", "IQN", "FQF"):
        raise ValueError(
            "Unknown model type {!r}, please retry with 'QR', 'IQN' or 'FQF'".format(model_type))
    if action is None:
        raise ValueError("make_critics requires an action input, got action=None")
    if model_type != "QR" and iqn_tau is None:
        raise ValueError("make_critics requires iqn_tau for model_type {!r}".format(model_type))
    if obs is None:
        obs = self.processed_obs

    with tf.variable_scope(scope, reuse=reuse):
        if self.feature_extraction == "cnn":
            critics_h = self.cnn_extractor(obs, **self.cnn_kwargs)
        else:
            critics_h = tf.layers.flatten(obs)

        # Concatenate preprocessed state and action
        qf_h = tf.concat([critics_h, action], axis=-1)

        # Double Q values to reduce overestimation
        if model_type == "QR":
            with tf.variable_scope('qf1', reuse=reuse):
                qf1_h = mlp(qf_h, self.layers, self.activ_fn, layer_norm=self.layer_norm)
                qf1 = tf.layers.dense(qf1_h, n_support, name="qf1")

            with tf.variable_scope('qf2', reuse=reuse):
                qf2_h = mlp(qf_h, self.layers, self.activ_fn, layer_norm=self.layer_norm)
                qf2 = tf.layers.dense(qf2_h, n_support, name="qf2")
        else:
            # "IQN" and "FQF" share the same architecture.
            embeding_size = self.layers[0]
            n_taus = iqn_tau.shape[1]
            n_cos_basis = 128  # number of cosine basis functions per fraction

            # cos(pi * i * tau) for i in [0, n_cos_basis), one row per fraction
            pi_mtx = tf.constant(
                np.expand_dims(np.pi * np.arange(0, n_cos_basis), axis=0), dtype=tf.float32)
            costau = tf.cos(tf.matmul(tf.reshape(iqn_tau, [-1, 1]), pi_mtx))

            def build_quantile_head(prefix):
                # Build one head inside its own variable scope; layer names and
                # creation order match the original qf1/qf2 code exactly.
                with tf.variable_scope(prefix, reuse=reuse):
                    state_embeding = tf.layers.dense(
                        inputs=qf_h,
                        units=embeding_size,
                        activation=self.activ_fn,
                        name="{}_embeding".format(prefix),
                        kernel_initializer=tf.initializers.random_normal(stddev=0.01))
                    phi = tf.layers.dense(
                        costau,
                        embeding_size,
                        self.activ_fn,
                        name="cos_embeding",
                        kernel_initializer=tf.initializers.random_normal(stddev=0.01))
                    if self.layer_norm:
                        state_embeding = tf.contrib.layers.layer_norm(
                            state_embeding, center=True, scale=True)
                        phi = tf.contrib.layers.layer_norm(phi, center=True, scale=True)

                    # Repeat the state-action embedding once per fraction so it
                    # lines up row-by-row with the fraction embedding.
                    tiled_embeding = tf.reshape(
                        tf.tile(state_embeding, [1, n_taus]), shape=[-1, embeding_size])
                    head_h = tf.multiply(tiled_embeding, phi)
                    if len(self.layers) > 1:
                        head_h = mlp(head_h, self.layers[1:], self.activ_fn,
                                     layer_norm=self.layer_norm)
                    return tf.reshape(tf.layers.dense(head_h, 1), [-1, n_taus], name=prefix)

            qf1 = build_quantile_head("qf1")
            qf2 = build_quantile_head("qf2")

        self.qf1 = qf1
        self.qf2 = qf2

    return self.qf1, self.qf2