def q_train(make_obs_ph_n, act_space_n, q_index, q_func, optimizer,
            grad_norm_clipping=None, local_q_func=False, scope="trainer",
            reuse=None, num_units=64):
    with tf.variable_scope(scope, reuse=reuse):
        # create action distributions
        act_pdtype_n = [make_pdtype(act_space) for act_space in act_space_n]

        # set up placeholders
        obs_ph_n = make_obs_ph_n
        act_ph_n = [
            act_pdtype_n[i].sample_placeholder([None], name="action" + str(i))
            for i in range(len(act_space_n))
        ]
        target_ph = tf.placeholder(tf.float32, [None], name="target")

        # centralized critic input: all agents' observations and actions
        q_input = tf.concat(obs_ph_n + act_ph_n, 1)
        if local_q_func:
            # decentralized (DDPG-style) critic: only this agent's obs/action
            q_input = tf.concat([obs_ph_n[q_index], act_ph_n[q_index]], 1)
        q = q_func(q_input, 1, scope="q_func", num_units=num_units)[:, 0]
        q_func_vars = U.scope_vars(U.absolute_scope_name("q_func"))

        q_loss = tf.reduce_mean(tf.square(q - target_ph))

        # viscosity solution to Bellman differential equation in place of an initial condition
        q_reg = tf.reduce_mean(tf.square(q))
        loss = q_loss  # + 1e-3 * q_reg

        optimize_expr = U.minimize_and_clip(optimizer, loss, q_func_vars,
                                            grad_norm_clipping)

        # Create callable functions
        train = U.function(inputs=obs_ph_n + act_ph_n + [target_ph],
                           outputs=loss,
                           updates=[optimize_expr])
        q_values = U.function(obs_ph_n + act_ph_n, q)

        # target network
        target_q = q_func(q_input, 1, scope="target_q_func",
                          num_units=num_units)[:, 0]
        target_q_func_vars = U.scope_vars(
            U.absolute_scope_name("target_q_func"))
        update_target_q = make_update_exp(q_func_vars, target_q_func_vars)

        target_q_values = U.function(obs_ph_n + act_ph_n, target_q)

        return train, update_target_q, {
            'q_values': q_values,
            'target_q_values': target_q_values
        }
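# --- Example (not part of the original file) --------------------------------
# The `target_ph` fed to `train` above is computed by the caller, outside the
# graph. A minimal NumPy sketch of the 1-step Bellman target a MADDPG-style
# trainer would supply; the discount value and variable names are illustrative
# assumptions, not taken from this file.
import numpy as np

def td_target(rew, done, target_q_next, gamma=0.95):
    """y = r + gamma * (1 - done) * Q_target(o', a'), elementwise over a batch."""
    return rew + gamma * (1.0 - done) * target_q_next

# usage sketch (U.function callables take positional args in placeholder order):
#   q_next = target_q_values(*(obs_next_n + act_next_n))
#   y = td_target(rew, done, q_next)
#   loss = train(*(obs_n + act_n + [y]))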
def p_train(make_obs_ph_n, act_space_n, agent_idx, p_func, q_func, optimizer,
            grad_norm_clipping=None, local_q_func=False, num_units=64,
            scope="trainer", reuse=None):
    """
    :param make_obs_ph_n: list of observation placeholders, one per agent
    :param act_space_n: list of action spaces, one per agent
    :param agent_idx: index of the agent whose policy is being trained
    :param p_func: policy network constructor (mlp_model in the base maddpg code)
    :param q_func: critic network constructor (mlp_model in the base maddpg code)
    :param optimizer: tf optimizer used for the policy update
    :param grad_norm_clipping: clip gradient norms to this value (None disables clipping)
    :param local_q_func: if True, use a decentralized (DDPG-style) critic
    :param num_units: number of hidden units per layer
    :param scope: variable scope name
    :param reuse: whether to reuse variables in the scope
    :return: act, train, update_target_p, and a dict of debug functions
    """
    with tf.variable_scope(scope, reuse=reuse):
        # create action distributions
        act_pdtype_n = [make_pdtype(act_space) for act_space in act_space_n]

        # set up placeholders
        obs_ph_n = [tf.layers.flatten(obs_ph) for obs_ph in make_obs_ph_n]
        act_ph_n = [
            act_pdtype_n[i].sample_placeholder([None], name="action" + str(i))
            for i in range(len(act_space_n))
        ]

        p_input = obs_ph_n[agent_idx]

        p = p_func(p_input,
                   int(act_pdtype_n[agent_idx].param_shape()[0]),
                   scope="p_func",
                   num_units=num_units)
        p_func_vars = U.scope_vars(U.absolute_scope_name("p_func"))

        # wrap parameters in distribution
        act_pd = act_pdtype_n[agent_idx].pdfromflat(p)

        act_sample = act_pd.sample()
        p_reg = tf.reduce_mean(tf.square(act_pd.flatparam()))

        # replace this agent's action with the policy's sample before feeding the critic
        act_input_n = act_ph_n + []
        act_input_n[agent_idx] = act_pd.sample()  # act_pd.mode()
        q_input = tf.concat(obs_ph_n + act_input_n, 1)
        q = q_func(q_input,
                   1,
                   scope="q_func" + str(1),
                   reuse=True,
                   num_units=num_units)[:, 0]
        loss = -tf.reduce_mean(q) + p_reg * 1e-3

        optimize_expr = U.minimize_and_clip(optimizer, loss, p_func_vars,
                                            grad_norm_clipping)

        # Create callable functions
        train = U.function(inputs=make_obs_ph_n + act_ph_n,
                           outputs=loss,
                           updates=[optimize_expr])
        act = U.function(inputs=[make_obs_ph_n[agent_idx]], outputs=act_sample)
        p_values = U.function([make_obs_ph_n[agent_idx]], p)

        # target network
        target_p = p_func(p_input,
                          int(act_pdtype_n[agent_idx].param_shape()[0]),
                          scope="target_p_func",
                          num_units=num_units)
        target_p_func_vars = U.scope_vars(
            U.absolute_scope_name("target_p_func"))
        update_target_p = make_update_exp(p_func_vars, target_p_func_vars)

        target_act_sample = act_pdtype_n[agent_idx].pdfromflat(
            target_p).sample()
        target_act = U.function(inputs=[make_obs_ph_n[agent_idx]],
                                outputs=target_act_sample)

        return act, train, update_target_p, {
            'p_values': p_values,
            'target_act': target_act
        }
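# --- Example (not part of the original file) --------------------------------
# make_update_exp is defined elsewhere in this module; in the reference MADDPG
# implementation it builds a Polyak (soft) update of the target-network
# variables. A minimal NumPy sketch of that update rule; the rate `tau` is an
# assumption, not taken from this file.
import numpy as np

def soft_update(online_vars, target_vars, tau=0.01):
    """theta_target <- (1 - tau) * theta_target + tau * theta_online, per variable."""
    return [(1.0 - tau) * t + tau * o for o, t in zip(online_vars, target_vars)]

# usage sketch:
#   target_weights = soft_update(online_weights, target_weights)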
def __init__(self, input_space, act_space, scope, args):
    self.input_shape = input_space
    self.act_space = act_space
    self.scope = scope
    self.replay_buffer = ReplayBuffer(1e6)
    self.max_replay_buffer_len = args.batch_size * args.max_episode_len
    self.replay_sample_index = None
    self.optimizer = tf.train.AdamOptimizer(learning_rate=args.lr)
    self.grad_norm_clipping = 0.5

    with tf.variable_scope(self.scope):
        act_pdtype = make_pdtype(act_space)

        # act_ph = act_pdtype.sample_placeholder([None], name="action")
        act_ph = tf.placeholder(tf.float32, shape=(None, 1))
        if args.game == "RoboschoolPong-v1":
            obs_ph = tf.placeholder(tf.float32,
                                    shape=(None, input_space.shape[0]))
        elif args.game == "Pong-2p-v0":
            obs_ph = tf.placeholder(tf.float32,
                                    shape=(None, input_space.shape[0],
                                           input_space.shape[1],
                                           input_space.shape[2]))
        q_target = tf.placeholder(tf.float32, shape=(None,))

        # build the world representation z
        z = conv_model(obs_ph, 20, scope="world_model")

        p_input = z
        p = mlp_model(p_input, 2, scope="p_func")
        p_func_vars = U.scope_vars(U.absolute_scope_name("p_func"))

        act_pd = act_pdtype.pdfromflat(p)
        act_sample = act_pd.sample()
        p_reg = tf.reduce_mean(tf.square(act_pd.flatparam()))

        # NOTE: the critic is evaluated at the policy's sampled action
        # (act_sample), not at the replayed action placeholder act_ph; also,
        # if mlp_model returns shape (batch, 1), q - q_target below broadcasts
        # against the (batch,) target, which is worth double-checking.
        q_input = tf.concat([z, act_sample], -1)
        q = mlp_model(q_input, 1, scope="q_func")
        q_func_vars = U.scope_vars(U.absolute_scope_name("q_func"))

        pg_loss = -tf.reduce_mean(q)
        q_loss = tf.reduce_mean(tf.square(q - q_target))
        # q_reg = tf.reduce_mean(tf.square(q))

        q_optimize_expr = U.minimize_and_clip(self.optimizer, q_loss,
                                              q_func_vars,
                                              self.grad_norm_clipping)

        p_loss = pg_loss + p_reg * 1e-3
        p_optimize_expr = U.minimize_and_clip(self.optimizer, p_loss,
                                              p_func_vars,
                                              self.grad_norm_clipping)

        p_values = U.function([obs_ph], p)

        # target networks
        target_p = mlp_model(z, 2, scope="target_p_func")
        target_p_func_vars = U.scope_vars(
            U.absolute_scope_name("target_p_func"))
        target_q = mlp_model(q_input, 1, scope="target_q_func")
        target_q_func_vars = U.scope_vars(
            U.absolute_scope_name("target_q_func"))
        target_act_sample = act_pdtype.pdfromflat(target_p).sample()

        self.update_target_p = make_update_exp(p_func_vars,
                                               target_p_func_vars)
        self.update_target_q = make_update_exp(q_func_vars,
                                               target_q_func_vars)

        self.act = U.function(inputs=[obs_ph], outputs=act_sample)
        self.target_act = U.function(inputs=[obs_ph],
                                     outputs=target_act_sample)
        self.p_train = U.function(inputs=[obs_ph] + [act_ph],
                                  outputs=p_loss,
                                  updates=[p_optimize_expr])
        self.q_train = U.function(inputs=[obs_ph] + [act_ph] + [q_target],
                                  outputs=q_loss,
                                  updates=[q_optimize_expr])
        self.q_values = U.function([obs_ph] + [act_ph], q)
        self.target_q_values = U.function([obs_ph] + [act_ph], target_q)
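# --- Example (not part of the original file) --------------------------------
# A hedged sketch of one update step using the callables built above, assuming
# `agent` is an instance of this class, the batch arrays come from
# agent.replay_buffer, make_update_exp returns a callable update as in the
# reference MADDPG code, and `gamma` is an illustrative value, not taken from
# this class.
import numpy as np

def update_step(agent, obs_t, act_t, rew, obs_tp1, done, gamma=0.99):
    # Bellman target from the target networks; flatten to (batch,) to match
    # the q_target placeholder
    act_tp1 = agent.target_act(obs_tp1)
    target_q_tp1 = np.asarray(agent.target_q_values(obs_tp1, act_tp1)).reshape(-1)
    q_target = rew + gamma * (1.0 - done) * target_q_tp1

    q_loss = agent.q_train(obs_t, act_t, q_target)  # critic step
    p_loss = agent.p_train(obs_t, act_t)            # actor step
    agent.update_target_p()
    agent.update_target_q()
    return q_loss, p_loss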
def build_train(make_obs_ph, q_func, num_actions, num_action_streams,
                batch_size, optimizer_name, learning_rate,
                grad_norm_clipping=None, gamma=0.99, double_q=True,
                scope="deepq", reuse=None, loss_type="L2"):
    """Creates the act and train functions along with the target-network update.

    Parameters
    ----------
    make_obs_ph: str -> tf.placeholder or TfInput
        a function that takes a name and creates a placeholder of input with that name
    q_func: (tf.Variable, int, str, bool) -> tf.Variable
        the model that takes the following inputs:
            observation_in: object
                the output of observation placeholder
            num_actions: int
                number of actions
            scope: str
            reuse: bool
                should be passed to outer variable scope
        and returns a tensor of shape (batch_size, num_actions) with values of every action.
    num_actions: int
        total number of sub-actions to be represented at the output
    num_action_streams: int
        specifies the number of action branches in action value (or advantage) function representation
    batch_size: int
        size of the sampled mini-batch from the replay buffer
    optimizer_name: str
        name of the optimizer to use for deep Q-learning (only "Adam" is supported here)
    learning_rate: float
        learning rate for the optimizer
    grad_norm_clipping: float or None
        clip gradient norms to this value. If None no clipping is performed.
    gamma: float
        discount rate.
    double_q: bool
        if true will use Double Q-Learning (https://arxiv.org/abs/1509.06461).
        In general it is a good idea to keep it enabled. BDQ uses it.
    scope: str or VariableScope
        optional scope for variable_scope.
    reuse: bool or None
        whether or not the variables should be reused. To be able to reuse the scope must be given.
    loss_type: str
        "L2" or "Huber" temporal-difference loss.

    Returns
    -------
    act: (tf.Variable, bool, float) -> tf.Variable
        function to select an action given an observation.
        See the top of the file for details.
    q_f: function
        the Q-value function returned alongside act by build_act.
    train: (object, np.array, np.array, object, np.array, np.array) -> np.array
        optimize the error in Bellman's equation.
        See the top of the file for details.
    update_target: () -> ()
        copy the parameters from optimized Q function to the target Q function.
        See the top of the file for details.
    debug: {str: function}
        a bunch of functions to print debug data like q_values.
    """
    act_f, q_f = build_act(make_obs_ph, q_func, num_actions,
                           num_action_streams, scope=scope, reuse=reuse)

    with tf.variable_scope(scope, reuse=reuse):
        # Set up placeholders
        obs_t_input = U.ensure_tf_input(make_obs_ph("obs_t"))
        act_t_ph = tf.placeholder(tf.int32, [None, num_action_streams],
                                  name="action")
        rew_t_ph = tf.placeholder(tf.float32, [None], name="reward")
        obs_tp1_input = U.ensure_tf_input(make_obs_ph("obs_tp1"))
        done_mask_ph = tf.placeholder(tf.float32, [None], name="done")
        importance_weights_ph = tf.placeholder(tf.float32, [None],
                                               name="weight")

        # Q-network evaluation
        q_t = q_func(obs_t_input.get(), num_actions, scope="q_func",
                     reuse=True)  # reuse parameters from act
        q_func_vars = U.scope_vars(U.absolute_scope_name("q_func"))

        # Target Q-network evaluation
        q_tp1 = q_func(obs_tp1_input.get(), num_actions,
                       scope="target_q_func")
        target_q_func_vars = U.scope_vars(
            U.absolute_scope_name("target_q_func"))

        if double_q:
            # Double Q-learning: select greedy next actions with the online network
            selection_q_tp1 = q_func(obs_tp1_input.get(), num_actions,
                                     scope="q_func", reuse=True)
        else:
            selection_q_tp1 = q_tp1

        num_actions_pad = num_actions // num_action_streams  # sub-actions per action branch

        # Q-value of the taken sub-action, per branch
        q_values = []
        for dim in range(num_action_streams):
            selected_a = tf.squeeze(
                tf.slice(act_t_ph, [0, dim], [batch_size, 1]))  # TODO better?
            q_values.append(
                tf.reduce_sum(tf.one_hot(selected_a, num_actions_pad) *
                              q_t[dim],
                              axis=1))

        # 1-step TD target, per branch
        target_q_values = []
        for dim in range(num_action_streams):
            selected_a = tf.argmax(selection_q_tp1[dim], axis=1)
            selected_q = tf.reduce_sum(
                tf.one_hot(selected_a, num_actions_pad) * q_tp1[dim], axis=1)
            masked_selected_q = (1.0 - done_mask_ph) * selected_q
            target_q = rew_t_ph + gamma * masked_selected_q
            target_q_values.append(target_q)

        if optimizer_name == "Adam":
            optimizer = tf.train.AdamOptimizer(learning_rate)
        else:
            assert False, 'unsupported optimizer ' + str(optimizer_name)

        if loss_type == "L2":
            loss_function = tf.square
        elif loss_type == "Huber":
            loss_function = U.huber_loss
        else:
            assert False, 'unsupported loss type ' + str(loss_type)

        stream_losses = []
        for dim in range(num_action_streams):
            dim_td_error = q_values[dim] - tf.stop_gradient(
                target_q_values[dim])
            dim_loss = loss_function(dim_td_error)
            # Scaling of learning based on importance sampling weights is optional, either way works
            stream_losses.append(
                tf.reduce_mean(dim_loss * importance_weights_ph))  # with scaling
            if dim == 0:
                td_error = tf.abs(dim_td_error)
            else:
                td_error += tf.abs(dim_td_error)

        mean_loss = sum(stream_losses) / num_action_streams

        optimize_expr = U.minimize_and_clip(
            optimizer,
            mean_loss,
            var_list=q_func_vars,
            total_n_streams=(num_action_streams),
            clip_val=grad_norm_clipping)
        optimize_expr = [optimize_expr]

        # Target Q-network parameters are periodically updated with the Q-network's
        update_target_expr = []
        for var, var_target in zip(
                sorted(q_func_vars, key=lambda v: v.name),
                sorted(target_q_func_vars, key=lambda v: v.name)):
            update_target_expr.append(var_target.assign(var))
        update_target_expr = tf.group(*update_target_expr)

        train = U.function(inputs=[
            obs_t_input, act_t_ph, rew_t_ph, obs_tp1_input, done_mask_ph,
            importance_weights_ph
        ],
                           outputs=td_error,
                           updates=optimize_expr)

        update_target = U.function([], [], updates=[update_target_expr])

        q_values = U.function([obs_t_input], q_t)

    return act_f, q_f, train, update_target, {'q_values': q_values}
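# --- Example (not part of the original file) --------------------------------
# A hedged sketch of driving the returned callables from a replay buffer. The
# buffer interface, `target_update_freq`, and the uniform importance weights
# are assumptions, not part of this file.
import numpy as np

def training_step(train, update_target, replay_buffer, batch_size, step,
                  target_update_freq=1000):
    # train expects exactly `batch_size` transitions, because act_t_ph is
    # sliced with a fixed [batch_size, 1] shape in the graph above.
    obs_t, act_t, rew_t, obs_tp1, done = replay_buffer.sample(batch_size)
    weights = np.ones_like(rew_t)  # uniform weights (no prioritized replay)

    # one gradient step on the mean per-branch loss; returns summed |TD error|
    td_errors = train(obs_t, act_t, rew_t, obs_tp1, done, weights)

    # periodically copy the online Q-network parameters into the target network
    if step % target_update_freq == 0:
        update_target()
    return td_errors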