def __init__(self, num_features, num_actions, timestep, action_space, scope):
    self.scope = scope
    self._lr = 0.5
    self.discount = 1.
    self.replay_buffer = ReplayBuffer(1e4)

    with tf.variable_scope(self.scope):
        self.act_trajectory = tf.placeholder(tf.float32, shape=(None, timestep, action_space))
        self.target = tf.placeholder(tf.float32, shape=(None,))
        self.act = tf.placeholder(tf.int32, shape=(None,))

        # opponent/action model: LSTM over the action trajectory
        self.tau = lstm_model(self.act_trajectory, num_actions, scope="tau_model_{}".format(scope))
        self.q_input = self.tau

        # train network
        self.q = mlp_model(self.q_input, 2, scope="q_model_{}".format(scope))
        q_func_vars = U.scope_vars(U.absolute_scope_name("q_model_{}".format(scope)))

        # target network
        self.target_q = mlp_model(self.q_input, 2, scope="target_q_model_{}".format(scope))
        target_q_func_vars = U.scope_vars(U.absolute_scope_name("target_q_model_{}".format(scope)))

        # take action
        self.softmax = tf.nn.softmax(self.target_q)
        self.pred = tf.argmax(self.softmax, axis=1)

        # calculate the loss
        # reduce_sum (not mean) so the one-hot mask picks out Q(s, a) for the taken action
        self.q_t_selected = tf.reduce_sum(self.q * tf.one_hot(self.act, num_actions), 1)
        q_tp1_best = tf.reduce_max(self.q, 1)  # note: unused below
        td_error = self.q_t_selected - tf.stop_gradient(self.target)
        self.errors = U.huber_loss(td_error)
        self.q_opt_op = tf.train.AdamOptimizer(self._lr).minimize(self.errors, var_list=q_func_vars)

        self.tau_loss = tf.reduce_mean(
            tf.nn.sparse_softmax_cross_entropy_with_logits(logits=self.tau, labels=self.act))
        self.tau_opt_op = tf.train.AdamOptimizer(self._lr).minimize(self.tau_loss)

        self.get_pred = U.function(inputs=[self.act_trajectory], outputs=[self.softmax])
        self.train_q = U.function(inputs=[self.act_trajectory, self.target, self.act],
                                  outputs=[self.errors, self.q],
                                  updates=[self.q_opt_op])
        self.train_tau = U.function(inputs=[self.act, self.act_trajectory],
                                    outputs=[self.tau_loss],
                                    updates=[self.tau_opt_op])
        self.update_model = make_update_exp(q_func_vars, target_q_func_vars)
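# Sketch (not part of the original class): a minimal NumPy illustration of the loss above,
# assuming U.huber_loss is the standard Huber loss with delta=1.0. It shows how the one-hot
# mask picks out Q(s, a) for the taken action before the TD error and Huber loss are formed.
import numpy as np

def huber(x, delta=1.0):
    # quadratic near zero, linear in the tails
    return np.where(np.abs(x) <= delta, 0.5 * x ** 2, delta * (np.abs(x) - 0.5 * delta))

q = np.array([[1.0, 2.0], [0.5, -0.3]])        # batch of Q-values, num_actions = 2
act = np.array([1, 0])                         # actions actually taken
target = np.array([1.5, 0.0])                  # externally supplied targets (self.target)

q_selected = (q * np.eye(2)[act]).sum(axis=1)  # one-hot selection, as in self.q_t_selected
td_error = q_selected - target
print(huber(td_error))                         # per-sample Huber loss, as in self.errors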
def build_train(train_dequeue, num_training_steps, q_func, num_actions, optimizer,
                grad_norm_clipping=None, data_format=None, gamma=None, multi_step_n=1,
                double_q=True, scope="deepq", reuse=None, replay_buffer=None,
                prioritized_replay_eps=None, bellman_h=None, bellman_ih=None,
                use_temporal_consistency=True):
    with tf.variable_scope(scope, reuse=reuse):
        (actor_num, obs_t_input, act_t_ph, rew_t_ph, obs_tp1_input,
         done_mask_ph, importance_weights_ph, idxs) = train_dequeue

        # q network evaluation
        q_t = q_func(obs_t_input, num_actions, scope="q_func", data_format=data_format)
        q_func_vars = U.scope_vars(U.absolute_scope_name("q_func"))

        # target q network evaluation
        q_tp1 = q_func(obs_tp1_input, num_actions, scope="target_q_func", data_format=data_format)
        target_q_func_vars = U.scope_vars(U.absolute_scope_name("target_q_func"))

        # q scores for actions which we know were selected in the given state
        q_t_selected = tf.reduce_sum(q_t * tf.one_hot(act_t_ph, num_actions), 1)

        # compute estimate of best possible value starting from state at t + 1
        if double_q:
            q_tp1_using_online_net = q_func(obs_tp1_input, num_actions, scope="q_func",
                                            reuse=True, data_format=data_format)
            q_tp1_best_using_online_net = tf.arg_max(q_tp1_using_online_net, 1)
            q_tp1_best = tf.reduce_sum(
                q_tp1 * tf.one_hot(q_tp1_best_using_online_net, num_actions), 1)
        else:
            q_tp1_best = tf.reduce_max(q_tp1, 1)
        q_tp1_best_masked = tf.stop_gradient((1.0 - done_mask_ph) * q_tp1_best)

        # compute RHS of the Bellman equation (multi-step, with the transformed Bellman
        # operator and its inverse passed in as bellman_h / bellman_ih)
        q_t_selected_target = bellman_h(rew_t_ph + gamma**multi_step_n * bellman_ih(q_tp1_best_masked))
        q_t_selected_target = tf.stop_gradient(q_t_selected_target)

        # compute the error (potentially clipped)
        td_error = q_t_selected - q_t_selected_target
        errors = U.huber_loss(td_error)

        # This temporal-consistency (TC) term was used by Pohlen et al. to allow higher
        # discount factors. It seems to slow down learning, so it is disabled for the demo,
        # although the authors claim it improves asymptotic performance.
        # Note that it reuses q_tp1_using_online_net, so it assumes double_q=True.
        if use_temporal_consistency:
            q_tp1_best_using_online_net_masked = (1.0 - done_mask_ph) * tf.reduce_max(q_tp1_using_online_net, 1)
            tc_error = q_tp1_best_using_online_net_masked - q_tp1_best_masked
            errors = errors + U.huber_loss(tc_error)

        weighted_error = tf.reduce_mean(importance_weights_ph * errors)

        # compute optimization op (potentially with gradient clipping)
        if grad_norm_clipping is not None:
            optimize_expr = U.minimize_and_clip(optimizer, weighted_error,
                                                var_list=q_func_vars, clip_val=grad_norm_clipping)
        else:
            optimize_expr = optimizer.minimize(weighted_error, var_list=q_func_vars)

        # update_target_fn will be called periodically to copy the Q network to the target Q network
        update_target_expr = []
        for var, var_target in zip(sorted(q_func_vars, key=lambda v: v.name),
                                   sorted(target_q_func_vars, key=lambda v: v.name)):
            update_target_expr.append(var_target.assign(var))
        update_target_expr = tf.group(*update_target_expr)

        # To avoid unnecessary copies between GPUs, we maintain a copy on the actors' GPU
        # that is updated each iteration.
        with tf.device('/gpu:0'):
            q_func(obs_t_input, num_actions, scope="read_q_func", data_format=data_format, reuse=True)
        read_q_func_vars = U.scope_vars(U.absolute_scope_name("read_q_func"))
        update_read_expr = []
        for var, var_target in zip(sorted(q_func_vars, key=lambda v: v.name),
                                   sorted(read_q_func_vars, key=lambda v: v.name)):
            update_read_expr.append(var_target.assign(var))
        update_read_expr = tf.group(*update_read_expr)

        if replay_buffer:
            new_priorities = tf.abs(td_error) + prioritized_replay_eps
            update_priority = replay_buffer.assign(idxs, new_priorities)
            optimize_expr = tf.group([optimize_expr, update_priority])

        with tf.control_dependencies([optimize_expr, update_read_expr]):
            train = tf.assign_add(num_training_steps, 1)

    return train, update_target_expr
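# Sketch (assumption, not from this file): bellman_h / bellman_ih above are not defined here;
# a common choice, following Pohlen et al. (2018), is the transformed Bellman operator
# h(z) = sign(z) * (sqrt(|z| + 1) - 1) + eps * z and its closed-form inverse. The NumPy
# version below is only illustrative of that choice.
import numpy as np

def h(z, eps=1e-2):
    return np.sign(z) * (np.sqrt(np.abs(z) + 1.0) - 1.0) + eps * z

def h_inv(z, eps=1e-2):
    # inverse of the eps-regularised square-root transform above
    return np.sign(z) * (((np.sqrt(1.0 + 4.0 * eps * (np.abs(z) + 1.0 + eps)) - 1.0) / (2.0 * eps)) ** 2 - 1.0)

z = np.array([-10.0, -1.0, 0.0, 1.0, 10.0])
print(np.allclose(h_inv(h(z)), z))  # True: h_inv undoes h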
def build_train(make_obs_ph, q_func, num_actions, optimizer, grad_norm_clipping=None,
                gamma=1.0, double_q=True, scope="deepq", reuse=None):
    """Creates the train function.

    Parameters
    ----------
    make_obs_ph: str -> tf.placeholder or TfInput
        a function that takes a name and creates a placeholder of input with that name
    q_func: (tf.Variable, int, str, bool) -> tf.Variable
        the model that takes the following inputs:
            observation_in: object
                the output of observation placeholder
            num_actions: int
                number of actions
            scope: str
            reuse: bool
                should be passed to outer variable scope
        and returns a tensor of shape (batch_size, num_actions) with values of every action.
    num_actions: int
        number of actions
    optimizer: tf.train.Optimizer
        optimizer to use for the Q-learning objective.
    grad_norm_clipping: float or None
        clip gradient norms to this value. If None no clipping is performed.
    gamma: float
        discount rate.
    double_q: bool
        if true will use Double Q Learning (https://arxiv.org/abs/1509.06461).
        In general it is a good idea to keep it enabled.
    scope: str or VariableScope
        optional scope for variable_scope.
    reuse: bool or None
        whether or not the variables should be reused. To be able to reuse, the scope must be given.

    Returns
    -------
    act: (tf.Variable, bool, float) -> tf.Variable
        function to select an action given an observation.
        See the top of the file for details.
    train: (object, np.array, np.array, object, np.array, np.array) -> np.array
        optimize the error in Bellman's equation.
        See the top of the file for details.
    update_target: () -> ()
        copy the parameters from the optimized Q function to the target Q function.
        See the top of the file for details.
    debug: {str: function}
        a bunch of functions to print debug data like q_values.
    """
    act_f = build_act(make_obs_ph, q_func, num_actions, scope=scope, reuse=reuse)

    with tf.variable_scope(scope, reuse=reuse):
        # set up placeholders
        obs_t_input = U.ensure_tf_input(make_obs_ph("obs_t"))
        act_t_ph = tf.placeholder(tf.int32, [None], name="action")
        rew_t_ph = tf.placeholder(tf.float32, [None], name="reward")
        obs_tp1_input = U.ensure_tf_input(make_obs_ph("obs_tp1"))
        done_mask_ph = tf.placeholder(tf.float32, [None], name="done")
        importance_weights_ph = tf.placeholder(tf.float32, [None], name="weight")

        # q network evaluation
        q_t = q_func(obs_t_input.get(), num_actions, scope="q_func", reuse=True)  # reuse parameters from act
        q_func_vars = U.scope_vars(U.absolute_scope_name("q_func"))

        # target q network evaluation
        q_tp1 = q_func(obs_tp1_input.get(), num_actions, scope="target_q_func")
        target_q_func_vars = U.scope_vars(U.absolute_scope_name("target_q_func"))

        # q scores for actions which we know were selected in the given state
        q_t_selected = tf.reduce_sum(q_t * tf.one_hot(act_t_ph, num_actions), 1)

        # compute estimate of best possible value starting from state at t + 1
        if double_q:
            q_tp1_using_online_net = q_func(obs_tp1_input.get(), num_actions, scope="q_func", reuse=True)
            q_tp1_best_using_online_net = tf.arg_max(q_tp1_using_online_net, 1)
            q_tp1_best = tf.reduce_sum(q_tp1 * tf.one_hot(q_tp1_best_using_online_net, num_actions), 1)
        else:
            q_tp1_best = tf.reduce_max(q_tp1, 1)
        q_tp1_best_masked = (1.0 - done_mask_ph) * q_tp1_best

        # compute RHS of the Bellman equation
        q_t_selected_target = rew_t_ph + gamma * q_tp1_best_masked

        # compute the error (potentially clipped)
        td_error = q_t_selected - tf.stop_gradient(q_t_selected_target)
        errors = U.huber_loss(td_error)
        weighted_error = tf.reduce_mean(importance_weights_ph * errors)

        # compute optimization op (potentially with gradient clipping)
        if grad_norm_clipping is not None:
            optimize_expr = U.minimize_and_clip(optimizer, weighted_error,
                                                var_list=q_func_vars, clip_val=grad_norm_clipping)
        else:
            optimize_expr = optimizer.minimize(weighted_error, var_list=q_func_vars)

        # update_target_fn will be called periodically to copy the Q network to the target Q network
        update_target_expr = []
        for var, var_target in zip(sorted(q_func_vars, key=lambda v: v.name),
                                   sorted(target_q_func_vars, key=lambda v: v.name)):
            update_target_expr.append(var_target.assign(var))
        update_target_expr = tf.group(*update_target_expr)

        # Create callable functions
        train = U.function(
            inputs=[obs_t_input, act_t_ph, rew_t_ph, obs_tp1_input, done_mask_ph, importance_weights_ph],
            outputs=td_error,
            updates=[optimize_expr])
        update_target = U.function([], [], updates=[update_target_expr])

        q_values = U.function([obs_t_input], q_t)

        return act_f, train, update_target, {'q_values': q_values}
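# Sketch (illustration only): the double-Q branch above in plain NumPy. The online network
# chooses argmax_a Q_online(s', a), the target network supplies the value of that action,
# and terminal transitions are masked out before the Bellman target is formed.
import numpy as np

gamma = 0.99
q_tp1_online = np.array([[1.0, 3.0], [2.0, 0.5]])   # Q_online(s', .)
q_tp1_target = np.array([[0.8, 2.5], [1.9, 0.7]])   # Q_target(s', .)
rew = np.array([1.0, -1.0])
done = np.array([0.0, 1.0])                         # second transition is terminal

best_act = q_tp1_online.argmax(axis=1)                           # action selection: online net
q_tp1_best = q_tp1_target[np.arange(len(best_act)), best_act]    # action evaluation: target net
q_tp1_best_masked = (1.0 - done) * q_tp1_best
target = rew + gamma * q_tp1_best_masked
print(target)   # [1.0 + 0.99 * 2.5, -1.0] = [3.475, -1.0]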
def build_dist_train(make_obs_ph, dist_func, num_actions, num_atoms, V_max, optimizer,
                     grad_norm_clipping=None, gamma=1.0, double_q=False, scope="deepq", reuse=None):
    """Creates the train function for the distributional (C51) agent.

    Parameters
    ----------
    make_obs_ph: str -> tf.placeholder or TfInput
        a function that takes a name and creates a placeholder of input with that name
    dist_func: (tf.Variable, int, str, bool) -> tf.Variable
        the model that takes the following inputs:
            observation_in: object
                the output of observation placeholder
            num_actions: int
                number of actions
            scope: str
            reuse: bool
                should be passed to outer variable scope
        and returns a tensor of shape (batch_size, num_actions * num_atoms) with the
        value-distribution probabilities of every action.
    num_actions: int
        number of actions
    num_atoms: int
        number of atoms in the value-distribution support
    V_max: float
        upper bound of the value support; the lower bound is -V_max
    optimizer: tf.train.Optimizer
        optimizer to use for the Q-learning objective.
    grad_norm_clipping: float or None
        clip gradient norms to this value. If None no clipping is performed.
    gamma: float
        discount rate.
    double_q: bool
        if true will use Double Q Learning (https://arxiv.org/abs/1509.06461).
        In general it is a good idea to keep it enabled.
    scope: str or VariableScope
        optional scope for variable_scope.
    reuse: bool or None
        whether or not the variables should be reused. To be able to reuse, the scope must be given.

    Returns
    -------
    act: (tf.Variable, bool, float) -> tf.Variable
        function to select an action given an observation.
        See the top of the file for details.
    train: (object, np.array, np.array, object, np.array, np.array) -> np.array
        optimize the error in Bellman's equation.
        See the top of the file for details.
    update_target: () -> ()
        copy the parameters from the optimized distribution network to the target network.
        See the top of the file for details.
    debug: {str: function}
        a bunch of functions to print debug data like q_values.
    """
    act_f = build_dist_act(make_obs_ph, dist_func, num_actions, num_atoms, V_max, scope=scope, reuse=reuse)

    with tf.variable_scope(scope, reuse=reuse):
        # set up placeholders
        obs_t_input = U.ensure_tf_input(make_obs_ph("obs_t"))
        act_t_ph = tf.placeholder(tf.int32, [None], name="action")
        rew_t_ph = tf.placeholder(tf.float32, [None], name="reward")
        obs_tp1_input = U.ensure_tf_input(make_obs_ph("obs_tp1"))
        done_mask_ph = tf.placeholder(tf.float32, [None], name="done")
        importance_weights_ph = tf.placeholder(tf.float32, [None], name="weight")

        # value distribution network evaluation
        v_dist_t = dist_func(obs_t_input.get(), num_actions, scope="dist_func", reuse=True)  # reuse parameters from act
        v_dist_func_vars = U.scope_vars(U.absolute_scope_name("dist_func"))
        # v_dist_t is p(x_t, a)

        # target value distribution network evaluation
        v_dist_tp1 = dist_func(obs_tp1_input.get(), num_actions, scope="target_dist_func")
        target_v_dist_func_vars = U.scope_vars(U.absolute_scope_name("target_dist_func"))
        # v_dist_tp1 is p(x_(t+1), a)

        # (0) Calculate p(x_t, a_t)
        #     x_t is given by obs_t_input, and a_t is given by act_t_ph
        batch_size = tf.shape(obs_t_input.get())[0]
        v_index1 = tf.range(batch_size) * tf.shape(v_dist_t)[1]
        v_index1 = tf.tile(tf.reshape(v_index1, [batch_size, 1]), [1, num_atoms])
        v_index2 = act_t_ph * num_atoms  # (3, 5, 7) => (3*51, 5*51, 7*51)
        v_index2 = tf.tile(tf.reshape(v_index2, [batch_size, 1]), [1, num_atoms])
        v_index2 = v_index2 + tf.range(num_atoms)
        v_index = v_index1 + v_index2
        v_index = tf.reshape(v_index, [-1])
        v_dist_t_selected = tf.gather(tf.reshape(v_dist_t, [-1]), v_index)
        v_dist_t_selected = tf.reshape(v_dist_t_selected, [batch_size, num_atoms])
        # => v_dist_t_selected is p(x_t, a_t)

        # (1) Calculate Q(x_(t+1), a)
        V_min = -V_max
        delta_z = (V_max - V_min) / (num_atoms - 1)
        q_tp1 = q_value(v_dist_tp1, num_atoms, num_actions, V_max, delta_z)

        # (2) Get argmax_a Q(x_(t+1), a)
        q_tp1_best = tf.reduce_max(q_tp1, 1)
        q_tp1_best_act = U.argmax(q_tp1, axis=1)
        q_tp1_best_act = tf.cast(q_tp1_best_act, tf.int32)
        # q_tp1_best_act is a* at the t+1 step

        # (3) Extract p(x_(t+1), a*)
        v_tp_index1 = tf.range(batch_size) * tf.shape(v_dist_tp1)[1]
        v_tp_index1 = tf.tile(tf.reshape(v_tp_index1, [batch_size, 1]), [1, num_atoms])
        v_tp_index2 = q_tp1_best_act * num_atoms  # (3, 5, 7) => (3*51, 5*51, 7*51)
        v_tp_index2 = tf.tile(tf.reshape(v_tp_index2, [batch_size, 1]), [1, num_atoms])
        # Check 1: tf.range broadcasting
        v_tp_index2 = v_tp_index2 + tf.range(num_atoms)
        v_tp_index = v_tp_index1 + v_tp_index2
        v_tp_index = tf.reshape(v_tp_index, [-1])
        v_dist_tp1_selected = tf.gather(tf.reshape(v_dist_tp1, [-1]), v_tp_index)
        v_dist_tp1_selected = tf.reshape(v_dist_tp1_selected, [batch_size, num_atoms])
        # v_dist_tp1_selected is p(x_(t+1), a*)

        # (4) Make T_z, b_j, l, u in matrix form
        z = tf.tile(tf.reshape(tf.range(-V_max, V_max + delta_z, delta_z), [1, num_atoms]), [batch_size, 1])
        r = tf.tile(tf.reshape(rew_t_ph, [batch_size, 1]), [1, num_atoms])
        done = tf.tile(tf.reshape(done_mask_ph, [batch_size, 1]), [1, num_atoms])
        T_z = r + z * gamma * (1 - done)
        T_z = tf.maximum(tf.minimum(T_z, V_max), V_min)  # clip T_z to [V_min, V_max]
        b = (T_z - V_min) / delta_z
        l, u = tf.floor(b), tf.ceil(b)
        l_id = tf.cast(l, tf.int32)
        u_id = tf.cast(u, tf.int32)
        # u, l are float; l_id, u_id are int32

        # (5) Accumulate the projected cross-entropy loss atom by atom
        v_dist_t_selected = tf.reshape(v_dist_t_selected, [-1])
        add_index = tf.range(batch_size) * num_atoms
        err = tf.zeros([batch_size])
        for j in range(num_atoms):
            l_index = l_id[:, j] + add_index
            u_index = u_id[:, j] + add_index
            p_tl = tf.gather(v_dist_t_selected, l_index)
            p_tu = tf.gather(v_dist_t_selected, u_index)
            log_p_tl = tf.log(p_tl)
            log_p_tu = tf.log(p_tu)
            p_tp1 = v_dist_tp1_selected[:, j]
            err = err + p_tp1 * ((u[:, j] - b[:, j]) * log_p_tl + (b[:, j] - l[:, j]) * log_p_tu)
        err = tf.negative(err)
        weighted_error = tf.reduce_mean(err)

        # compute optimization op (potentially with gradient clipping)
        if grad_norm_clipping is not None:
            optimize_expr = U.minimize_and_clip(optimizer, weighted_error,
                                                var_list=v_dist_func_vars, clip_val=grad_norm_clipping)
        else:
            optimize_expr = optimizer.minimize(weighted_error, var_list=v_dist_func_vars)

        # update_target_fn will be called periodically to copy the distribution network
        # to the target distribution network
        update_target_expr = []
        for var, var_target in zip(sorted(v_dist_func_vars, key=lambda v: v.name),
                                   sorted(target_v_dist_func_vars, key=lambda v: v.name)):
            update_target_expr.append(var_target.assign(var))
        update_target_expr = tf.group(*update_target_expr)

        # Create callable functions
        train = U.function(
            inputs=[obs_t_input, act_t_ph, rew_t_ph, obs_tp1_input, done_mask_ph, importance_weights_ph],
            outputs=weighted_error,
            updates=[optimize_expr])
        update_target = U.function([], [], updates=[update_target_expr])

        q_values = U.function([obs_t_input], v_dist_t)

        return act_f, train, update_target, {'q_dist_values': q_values}
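# Sketch (illustration only): step (4) of the distributional update above, in NumPy, for a
# single transition. It shows how T_z is clipped to [V_min, V_max], mapped to a fractional
# index b, and split between the neighbouring atoms l = floor(b) and u = ceil(b). The
# training code uses the same split inside a cross-entropy instead of materialising m.
import numpy as np

num_atoms, V_max = 5, 2.0
V_min = -V_max
delta_z = (V_max - V_min) / (num_atoms - 1)
z = np.linspace(V_min, V_max, num_atoms)       # support: [-2, -1, 0, 1, 2]

gamma, r, done = 0.8, 0.3, 0.0
p_tp1 = np.array([0.1, 0.2, 0.4, 0.2, 0.1])    # p(x_(t+1), a*) from the target net

T_z = np.clip(r + gamma * z * (1.0 - done), V_min, V_max)
b = (T_z - V_min) / delta_z
l, u = np.floor(b), np.ceil(b)

m = np.zeros(num_atoms)                        # projected target distribution
for j in range(num_atoms):
    m[int(l[j])] += p_tp1[j] * (u[j] - b[j])
    m[int(u[j])] += p_tp1[j] * (b[j] - l[j])
print(m, m.sum())                              # probability mass is preserved (sums to 1)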
def build_train(make_obs_ph, q_func, n_actions, optimizer, grad_norm_clipping=None,
                gamma=1.0, double_q=True, scope="deepq", reuse=None,
                param_noise=False, param_noise_filter_func=None):
    """
    :param make_obs_ph: str -> tf.placeholder
        a function that creates a placeholder with the given name
    :param q_func: (input, n_actions, scope, reuse) -> tf.Tensor
        the model that takes the following parameters:
            input: tf.placeholder
            n_actions: int, number of actions
            scope: str
            reuse: bool, whether to reuse the variables from the scope
    :param n_actions: number of actions
    :param optimizer: optimizer to use for the Q-learning objective
    :param grad_norm_clipping: clip gradient norms to this value; None disables clipping
    :param gamma: discount factor
    :param double_q: bool, whether to use double Q-learning or not
    :param scope: scope for variable_scope
    :param reuse: whether to reuse variables in the scope
    :param param_noise: whether to use parameter-space noise (not implemented)
    :param param_noise_filter_func: filter function for parameter-space noise
    :return: a bunch of functions
        act_f: function to generate actions
        train_f: function to update the main network
        update_target_f: function used to update the target network
        {}: other useful functions
    """
    if param_noise:
        raise NotImplementedError()
    else:
        act_f = build_act(make_obs_ph, q_func, n_actions, scope=scope, reuse=reuse)

    with tf.variable_scope(scope, reuse=reuse):
        gamma = tf.constant(gamma, name="gamma")

        obs_t_ph = U.ensure_tf_input(make_obs_ph("obs_t"))
        act_t_ph = tf.placeholder(tf.int64, shape=[None], name="action")
        rew_t_ph = tf.placeholder(tf.float32, shape=[None], name="reward")
        obs_tp1_ph = U.ensure_tf_input(make_obs_ph("obs_tp1"))
        done_mask_ph = tf.placeholder(tf.float32, shape=[None], name="done")
        weights_ph = tf.placeholder(tf.float32, shape=[None], name="weight")

        # q values
        q_t = q_func(obs_t_ph.get(), n_actions, scope="q_func", reuse=True)
        q_func_vars = U.scope_vars(U.absolute_scope_name("q_func"))

        # target q values
        q_target_tp1 = q_func(obs_tp1_ph.get(), n_actions, scope="q_target_func")
        q_target_vars = U.scope_vars(U.absolute_scope_name("q_target_func"))

        if double_q:
            # select a* with the online network, evaluate it with the target network
            q_tp1 = q_func(obs_tp1_ph.get(), n_actions, scope='q_func', reuse=True)
            responsible_actions = tf.argmax(q_tp1, axis=1)
            double_q_value = tf.reduce_sum(q_target_tp1 * tf.one_hot(responsible_actions, n_actions), axis=1)
        else:
            raise NotImplementedError()

        double_q_value_masked = (1.0 - done_mask_ph) * double_q_value
        q_true_value = rew_t_ph + gamma * double_q_value_masked
        q_current_value = tf.reduce_sum(q_t * tf.one_hot(act_t_ph, n_actions), axis=1)

        td_error = q_current_value - tf.stop_gradient(q_true_value)
        errors = U.huber_loss(td_error)
        weighted_error = tf.reduce_mean(errors * weights_ph)

        if grad_norm_clipping is not None:
            train_op = U.minimize_and_clip(optimizer, weighted_error, q_func_vars, clip_val=grad_norm_clipping)
        else:
            train_op = optimizer.minimize(weighted_error, var_list=q_func_vars)

        with tf.variable_scope("update_target", reuse=False):
            update_target_ops = []
            for qvar, qtarget_var in zip(sorted(q_func_vars, key=lambda v: v.name),
                                         sorted(q_target_vars, key=lambda v: v.name)):
                update_target_ops.append(qtarget_var.assign(qvar))
            update_target_network = tf.group(*update_target_ops)

        # create callable functions
        train_f = U.make_function(
            inputs=[obs_t_ph, act_t_ph, rew_t_ph, obs_tp1_ph, done_mask_ph, weights_ph],
            outputs=td_error,
            updates=[train_op])
        update_target_f = U.make_function([], [], updates=[update_target_network])
        q_values_f = U.make_function([obs_t_ph], q_t)

    return act_f, train_f, update_target_f, {'q_values': q_values_f}
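# Sketch (illustration only, assuming this module's build_act / U helpers and a suitable
# q_func and make_obs_ph are defined elsewhere): the typical call order for the functions
# returned above. train_f's argument order mirrors the inputs list passed to
# U.make_function: (obs_t, actions, rewards, obs_tp1, dones, importance weights).
#
#   act_f, train_f, update_target_f, debug = build_train(
#       make_obs_ph, q_func, n_actions, tf.train.AdamOptimizer(1e-4),
#       grad_norm_clipping=10, gamma=0.99)
#   update_target_f()                          # sync the target net once at the start
#   for step in range(total_steps):
#       ...collect a transition, store it, and sample a batch from the replay buffer...
#       td = train_f(obs_t, acts, rews, obs_tp1, dones, np.ones_like(rews))
#       if step % target_update_period == 0:
#           update_target_f()                  # periodically copy online -> target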