def critic_update(self, state, action, next_state, done, reward,
                  n_state, n_done, n_reward, actual_n, weights, gamma):
    print("Critic update tracing")
    critic_variables = self.online_critic.trainable_variables
    with tf.GradientTape() as tape:
        tape.watch(critic_variables)
        q_value1, q_value2 = self.online_critic({'state': state, 'action': action},
                                                training=True)
        q_value1, q_value2 = tf.squeeze(q_value1), tf.squeeze(q_value2)

        # 1-step TD loss for both critics
        target = self.compute_target(next_state, done, reward, 1, gamma)
        target = tf.stop_gradient(target)
        td_loss1, td_loss2 = q_value1 - target, q_value2 - target
        huber_td1, huber_td2 = huber_loss(td_loss1), huber_loss(td_loss2)
        mean_td1 = tf.reduce_mean(huber_td1 * weights)
        mean_td2 = tf.reduce_mean(huber_td2 * weights)
        self.update_metrics('TD1', mean_td1)
        self.update_metrics('TD2', mean_td2)

        # n-step TD loss for both critics
        n_target = self.compute_target(n_state, n_done, n_reward, actual_n, gamma)
        n_target = tf.stop_gradient(n_target)
        ntd_loss1, ntd_loss2 = q_value1 - n_target, q_value2 - n_target
        huber_ntd1, huber_ntd2 = huber_loss(ntd_loss1), huber_loss(ntd_loss2)
        mean_ntd1 = tf.reduce_mean(huber_ntd1 * weights)
        mean_ntd2 = tf.reduce_mean(huber_ntd2 * weights)
        self.update_metrics('nTD1', mean_ntd1)
        self.update_metrics('nTD2', mean_ntd2)

        # L2 regularization collected from the critic's layers
        l2 = tf.add_n(self.online_critic.losses)
        self.update_metrics('critic_l2', l2)

        critic_loss = mean_td1 + mean_td2 + mean_ntd1 + mean_ntd2 + l2
        self.update_metrics('critic_loss', critic_loss)

    gradients = tape.gradient(critic_loss, critic_variables)
    for i, g in enumerate(gradients):
        self.update_metrics('Critic_Gradient_norm', tf.norm(g))
        gradients[i] = tf.clip_by_norm(g, 10)
    self.q_optimizer.apply_gradients(zip(gradients, critic_variables))

    # use the n-step TD error of the first critic as the new replay priority
    priorities = tf.abs(ntd_loss1)
    return priorities
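# A minimal, hedged sketch of what the compute_target helper used above could look like for
# this twin-critic setup: bootstrap from target networks over `actual_n` steps and take the
# minimum of the two target Q-values. The attribute names `target_actor` and `target_critic`,
# and the assumption that `reward` already holds the discounted n-step return, are
# illustrative and not taken from the original code.
def compute_target(self, next_state, done, reward, actual_n, gamma):
    next_action = self.target_actor(next_state, training=False)
    q1, q2 = self.target_critic({'state': next_state, 'action': next_action}, training=False)
    q_min = tf.minimum(tf.squeeze(q1), tf.squeeze(q2))
    discount = gamma ** tf.cast(actual_n, tf.float32)
    # mask the bootstrap term on terminal transitions
    return reward + discount * (1.0 - tf.cast(done, tf.float32)) * q_min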
def nn_update(self, state, action, next_state, done, reward,
              n_state, n_done, n_reward, actual_n, weights, gamma, demo):
    print("Q-nn_update tracing")
    online_variables = self.online_model.trainable_variables
    with tf.GradientTape() as tape:
        q_values = self.online_model(state, training=True)

        # supervised large-margin loss on demonstration transitions
        margin = self.margin_loss(q_values, action, demo, weights)
        self.update_metrics('Margin', margin)

        q_value = take_vector_elements(q_values, action)

        # 1-step TD loss
        target = self.compute_target(next_state, done, reward, 1, gamma)
        target = tf.stop_gradient(target)
        td_loss = q_value - target
        huber_td = huber_loss(td_loss)
        mean_td = tf.reduce_mean(huber_td * weights)
        self.update_metrics('TD', mean_td)

        # n-step TD loss
        n_target = self.compute_target(n_state, n_done, n_reward, actual_n, gamma)
        n_target = tf.stop_gradient(n_target)
        ntd_loss = q_value - n_target
        huber_ntd = huber_loss(ntd_loss)
        mean_ntd = tf.reduce_mean(huber_ntd * weights)
        self.update_metrics('nTD', mean_ntd)

        # L2 regularization collected from the model's layers
        l2 = tf.add_n(self.online_model.losses)
        self.update_metrics('l2', l2)

        all_losses = mean_td + mean_ntd + margin + l2
        self.update_metrics('all_losses', all_losses)

    gradients = tape.gradient(all_losses, online_variables)
    for i, g in enumerate(gradients):
        self.update_metrics('Gradient_norm', tf.norm(g))
        gradients[i] = tf.clip_by_norm(g, 10)
    self.q_optimizer.apply_gradients(zip(gradients, online_variables))

    # the n-step TD error drives the new replay priorities
    priorities = tf.abs(ntd_loss)
    return priorities
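# A hedged sketch of the margin_loss referenced above, following the large-margin supervised
# loss from DQfD (Hester et al., 2018): max_a [Q(s, a) + l(a_E, a)] - Q(s, a_E), applied only
# on demonstration transitions. The margin constant and the exact weighting are assumptions
# for illustration, not taken from the original code.
def margin_loss(self, q_values, action, demo, weights, margin=0.8):
    # l(a_E, a): zero for the demonstrated action, `margin` for every other action
    expert_margin = tf.one_hot(action, q_values.shape[-1], on_value=0.0, off_value=margin)
    max_margin_q = tf.reduce_max(q_values + expert_margin, axis=1)
    q_demo_action = take_vector_elements(q_values, action)
    # demo is assumed to be a 0/1 mask selecting demonstration samples
    per_sample = (max_margin_q - q_demo_action) * tf.cast(demo, tf.float32)
    return tf.reduce_mean(per_sample * weights)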
def train(self, obs_t, act_t, rew_t, obs_tp, done_mask, importance_weights):
    # compute the bootstrap value for the next observation
    if self.double_q:
        # Double DQN: pick the argmax action with the online net, evaluate it with the target net
        q_tp1_best_using_online_net = tf.argmax(self.double_q_function(obs_tp), axis=1)
        q_tp1_best = tf.reduce_sum(
            self.target_q_function(obs_tp) * tf.one_hot(q_tp1_best_using_online_net, self.n_actions),
            axis=1)
    else:
        q_tp1_best = tf.reduce_max(self.target_q_function(obs_tp), axis=1)
    q_tp1_best_masked = (1.0 - done_mask) * q_tp1_best

    # RHS of the Bellman equation
    q_t_selected_target = (tf.cast(rew_t, tf.float32)
                           + tf.cast(self.gamma, tf.float32) * q_tp1_best_masked)

    with tf.GradientTape() as tape:
        q_t_selected = tf.reduce_sum(
            self.q_function(obs_t) * tf.one_hot(act_t, self.n_actions), axis=1)
        td_error = q_t_selected - tf.stop_gradient(q_t_selected_target)
        errors = tf_util.huber_loss(td_error)
        # note: importance_weights is accepted but not applied to the loss in this variant
        weighted_error = tf.reduce_mean(errors)
    grads = tape.gradient(weighted_error, self.qfunc_layers.trainable_variables)
    self.optimizer.apply_gradients(zip(grads, self.qfunc_layers.trainable_variables))
    return td_error, weighted_error
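# For reference, the huber_loss used in the TD updates above is the standard piecewise
# quadratic/linear loss, as defined by baselines-style tf_util helpers; a self-contained
# sketch:
def huber_loss(x, delta=1.0):
    return tf.where(
        tf.abs(x) < delta,
        0.5 * tf.square(x),                 # quadratic near zero
        delta * (tf.abs(x) - 0.5 * delta))  # linear for large errors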
def build_train_att(make_obs_ph, q_func, num_actions, optimizer, mask_func,
                    grad_norm_clipping=None, gamma=1.0, double_q=False,
                    scope="deepq", reuse=None):
    act_f = build_act(make_obs_ph, q_func, num_actions, scope=scope, reuse=reuse)

    with tf.variable_scope(scope, reuse=reuse):
        # set up placeholders
        obs_t_input = make_obs_ph("obs_t")
        act_t_ph = tf.placeholder(tf.int32, [None], name="action")
        rew_t_ph = tf.placeholder(tf.float32, [None], name="reward")
        obs_tp1_input = make_obs_ph("obs_tp1")
        done_mask_ph = tf.placeholder(tf.float32, [None], name="done")
        importance_weights_ph = tf.placeholder(tf.float32, [None], name="weight")

        # q network evaluation
        q_t = q_func(obs_t_input.get(), num_actions, scope="q_func", reuse=True)  # reuse parameters from act
        q_func_vars = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES,
                                        scope=tf.get_variable_scope().name + "/q_func")

        # target q network evaluation
        q_tp1 = q_func(obs_tp1_input.get(), num_actions, scope="target_q_func")
        target_q_func_vars = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES,
                                               scope=tf.get_variable_scope().name + "/target_q_func")

        # q scores for actions which we know were selected in the given state
        q_t_selected = tf.reduce_sum(q_t * tf.one_hot(act_t_ph, num_actions), 1)

        # compute estimate of best possible value starting from state at t + 1
        # (the double_q branch is unmodified)
        if double_q:
            q_tp1_using_online_net = q_func(obs_tp1_input.get(), num_actions, scope="q_func", reuse=True)
            q_tp1_best_using_online_net = tf.argmax(q_tp1_using_online_net, 1)
            q_tp1_best = tf.reduce_sum(q_tp1 * tf.one_hot(q_tp1_best_using_online_net, num_actions), 1)
        else:
            # modified for greedy action set building: add the mask to q_tp1
            actions_mask = mask_func(obs_tp1_input)
            q_tp1 = q_tp1 + actions_mask
            q_tp1_best = tf.reduce_max(q_tp1, 1)
        q_tp1_best_masked = (1.0 - done_mask_ph) * q_tp1_best

        # compute RHS of bellman equation
        q_t_selected_target = rew_t_ph + gamma * q_tp1_best_masked

        # compute the error (potentially clipped)
        td_error = q_t_selected - tf.stop_gradient(q_t_selected_target)
        errors = U.huber_loss(td_error)
        weighted_error = tf.reduce_mean(importance_weights_ph * errors)

        # compute optimization op (potentially with gradient clipping)
        if grad_norm_clipping is not None:
            gradients = optimizer.compute_gradients(weighted_error, var_list=q_func_vars)
            for i, (grad, var) in enumerate(gradients):
                if grad is not None:
                    gradients[i] = (tf.clip_by_norm(grad, grad_norm_clipping), var)
            optimize_expr = optimizer.apply_gradients(gradients)
        else:
            optimize_expr = optimizer.minimize(weighted_error, var_list=q_func_vars)

        # update_target_fn will be called periodically to copy the Q network to the target Q network
        update_target_expr = []
        for var, var_target in zip(sorted(q_func_vars, key=lambda v: v.name),
                                   sorted(target_q_func_vars, key=lambda v: v.name)):
            update_target_expr.append(var_target.assign(var))
        update_target_expr = tf.group(*update_target_expr)

        # create callable functions
        train = U.function(
            inputs=[obs_t_input, act_t_ph, rew_t_ph, obs_tp1_input, done_mask_ph, importance_weights_ph],
            outputs=td_error,
            updates=[optimize_expr])
        update_target = U.function([], [], updates=[update_target_expr])
        q_values = U.function([obs_t_input], q_t)

        return act_f, train, update_target, {'q_values': q_values}
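# The mask_func passed to build_train_att is expected to produce an additive mask of shape
# [batch_size, num_actions]: 0 for actions allowed in obs_tp1 and a large negative value for
# disallowed ones, so tf.reduce_max ignores them. A hypothetical helper (not part of the
# original code), assuming a 0/1 `valid_actions` tensor is available:
def make_additive_action_mask(valid_actions):
    return (1.0 - tf.cast(valid_actions, tf.float32)) * -1e9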
def build_train(make_obs_ph, q_func, num_actions, optimizer, grad_norm_clipping=None,
                gamma=1.0, double_q=True, scope="deepq", reuse=None,
                param_noise=False, param_noise_filter_func=None):
    """Creates the train function.

    Parameters
    ----------
    make_obs_ph: str -> tf.placeholder or TfInput
        a function that takes a name and creates a placeholder of input with that name
    q_func: (tf.Variable, int, str, bool) -> tf.Variable
        the model that takes the following inputs:
            observation_in: object
                the output of observation placeholder
            num_actions: int
                number of actions
            scope: str
            reuse: bool
                should be passed to outer variable scope
        and returns a tensor of shape (batch_size, num_actions) with values of every action.
    num_actions: int
        number of actions
    optimizer: tf.train.Optimizer
        optimizer to use for the Q-learning objective.
    grad_norm_clipping: float or None
        clip gradient norms to this value. If None no clipping is performed.
    gamma: float
        discount rate.
    double_q: bool
        if true will use Double Q Learning (https://arxiv.org/abs/1509.06461).
        In general it is a good idea to keep it enabled.
    scope: str or VariableScope
        optional scope for variable_scope.
    reuse: bool or None
        whether or not the variables should be reused. To be able to reuse the scope must be given.
    param_noise: bool
        whether or not to use parameter space noise (https://arxiv.org/abs/1706.01905)
    param_noise_filter_func: tf.Variable -> bool
        function that decides whether or not a variable should be perturbed. Only applicable
        if param_noise is True. If set to None, default_param_noise_filter is used by default.

    Returns
    -------
    act: (tf.Variable, bool, float) -> tf.Variable
        function to select an action given an observation.
        See the top of the file for details.
    train: (object, np.array, np.array, object, np.array, np.array) -> np.array
        optimize the error in Bellman's equation.
        See the top of the file for details.
    update_target: () -> ()
        copy the parameters from the optimized Q function to the target Q function.
        See the top of the file for details.
    debug: {str: function}
        a bunch of functions to print debug data like q_values.
    """
    if param_noise:
        act_f = build_act_with_param_noise(make_obs_ph, q_func, num_actions, scope=scope,
                                           reuse=reuse,
                                           param_noise_filter_func=param_noise_filter_func)
    else:
        act_f = build_act(make_obs_ph, q_func, num_actions, scope=scope, reuse=reuse)

    with tf.variable_scope(scope, reuse=reuse):
        # set up placeholders
        obs_t_input = make_obs_ph("obs_t")
        act_t_ph = tf.placeholder(tf.int32, [None], name="action")
        rew_t_ph = tf.placeholder(tf.float32, [None], name="reward")
        obs_tp1_input = make_obs_ph("obs_tp1")
        done_mask_ph = tf.placeholder(tf.float32, [None], name="done")
        importance_weights_ph = tf.placeholder(tf.float32, [None], name="weight")

        # q network evaluation
        q_t = q_func(obs_t_input.get(), num_actions, scope="q_func", reuse=True)  # reuse parameters from act
        q_func_vars = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES,
                                        scope=tf.get_variable_scope().name + "/q_func")

        # target q network evaluation
        q_tp1 = q_func(obs_tp1_input.get(), num_actions, scope="target_q_func")
        target_q_func_vars = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES,
                                               scope=tf.get_variable_scope().name + "/target_q_func")

        # q scores for actions which we know were selected in the given state
        q_t_selected = tf.reduce_sum(q_t * tf.one_hot(act_t_ph, num_actions), 1)

        # compute estimate of best possible value starting from state at t + 1
        if double_q:
            q_tp1_using_online_net = q_func(obs_tp1_input.get(), num_actions, scope="q_func", reuse=True)
            q_tp1_best_using_online_net = tf.argmax(q_tp1_using_online_net, 1)
            q_tp1_best = tf.reduce_sum(q_tp1 * tf.one_hot(q_tp1_best_using_online_net, num_actions), 1)
        else:
            q_tp1_best = tf.reduce_max(q_tp1, 1)
        q_tp1_best_masked = (1.0 - done_mask_ph) * q_tp1_best

        # compute RHS of bellman equation
        q_t_selected_target = rew_t_ph + gamma * q_tp1_best_masked

        # compute the error (potentially clipped)
        td_error = q_t_selected - tf.stop_gradient(q_t_selected_target)
        errors = U.huber_loss(td_error)
        weighted_error = tf.reduce_mean(importance_weights_ph * errors)

        # compute optimization op (potentially with gradient clipping)
        if grad_norm_clipping is not None:
            gradients = optimizer.compute_gradients(weighted_error, var_list=q_func_vars)
            for i, (grad, var) in enumerate(gradients):
                if grad is not None:
                    gradients[i] = (tf.clip_by_norm(grad, grad_norm_clipping), var)
            optimize_expr = optimizer.apply_gradients(gradients)
        else:
            optimize_expr = optimizer.minimize(weighted_error, var_list=q_func_vars)

        # update_target_fn will be called periodically to copy the Q network to the target Q network
        update_target_expr = []
        for var, var_target in zip(sorted(q_func_vars, key=lambda v: v.name),
                                   sorted(target_q_func_vars, key=lambda v: v.name)):
            update_target_expr.append(var_target.assign(var))
        update_target_expr = tf.group(*update_target_expr)

        # create callable functions
        train = U.function(
            inputs=[obs_t_input, act_t_ph, rew_t_ph, obs_tp1_input, done_mask_ph, importance_weights_ph],
            outputs=td_error,
            updates=[optimize_expr])
        update_target = U.function([], [], updates=[update_target_expr])
        q_values = U.function([obs_t_input], q_t)

        return act_f, train, update_target, {'q_values': q_values}
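# A hedged usage sketch for the functions returned by build_train, in the usual baselines
# training-loop shape. ObservationInput, the Adam hyperparameters, the q_model argument, and
# the uniform importance weights are assumptions for illustration, not taken from this code.
import numpy as np
import tensorflow as tf
from baselines.common import tf_util as U
from baselines.deepq.utils import ObservationInput

def make_dqn_train_step(env, q_model):
    act, train, update_target, debug = build_train(
        make_obs_ph=lambda name: ObservationInput(env.observation_space, name=name),
        q_func=q_model,
        num_actions=env.action_space.n,
        optimizer=tf.train.AdamOptimizer(learning_rate=5e-4),
        grad_norm_clipping=10,
        gamma=0.99)
    U.initialize()
    update_target()  # sync the target network once before training starts

    def step(obs_t, actions, rewards, obs_tp1, dones):
        # uniform importance weights; with prioritized replay these would come from the buffer
        weights = np.ones_like(rewards, dtype=np.float32)
        return train(obs_t, actions, rewards, obs_tp1, dones, weights)

    return act, step, update_target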