def q_train(make_obs_ph_n, act_space_n, q_index, q_func, optimizer, grad_norm_clipping=None,
            local_q_func=False, scope="trainer", reuse=None, num_units=64):
    """Build the critic (Q-function) graph and its callables for one agent.

    Args:
        make_obs_ph_n: Observation placeholders; used directly as a list
            (indexed and concatenated with the action placeholders).
        act_space_n: Action spaces, one per agent; each is handed to
            ``make_pdtype``.
        q_index: Index of the agent whose observation/action are used when
            ``local_q_func`` is true.
        q_func: Network builder, called as
            ``q_func(inputs, 1, scope=..., num_units=...)``.
        optimizer: Forwarded to ``U.minimize_and_clip``.
        grad_norm_clipping: Forwarded to ``U.minimize_and_clip``.
        local_q_func: When true the critic input is only agent ``q_index``'s
            observation and action; otherwise every agent's.
        scope: Variable scope everything is built under.
        reuse: ``reuse`` flag for that variable scope.
        num_units: Forwarded to ``q_func``.

    Returns:
        A tuple ``(train, update_target_q, debug)`` where ``debug`` is a dict
        with the callables ``'q_values'`` and ``'target_q_values'``.
    """
    with tf.variable_scope(scope, reuse=reuse):
        # Action distribution types, one per agent.
        pdtypes = [make_pdtype(space) for space in act_space_n]

        # Placeholders: observations are supplied by the caller, actions and
        # the regression target are created here.
        obs_ph_n = make_obs_ph_n
        act_ph_n = [
            pdtype.sample_placeholder([None], name="action" + str(idx))
            for idx, pdtype in enumerate(pdtypes)
        ]
        target_ph = tf.placeholder(tf.float32, [None], name="target")

        # Critic input: a single agent's (obs, action) or everyone's.
        critic_parts = (
            [obs_ph_n[q_index], act_ph_n[q_index]]
            if local_q_func
            else obs_ph_n + act_ph_n
        )
        q_input = tf.concat(critic_parts, 1)

        q = q_func(q_input, 1, scope="q_func", num_units=num_units)[:, 0]
        q_func_vars = U.scope_vars(U.absolute_scope_name("q_func"))

        # Mean squared error against the supplied target.
        q_loss = tf.reduce_mean(tf.square(q - target_ph))

        # Squared-Q regulariser: still built, as in the original, but not
        # added to the loss (the 1e-3 * q_reg term is disabled).
        q_reg = tf.reduce_mean(tf.square(q))
        loss = q_loss

        optimize_expr = U.minimize_and_clip(
            optimizer, loss, q_func_vars, grad_norm_clipping
        )

        # Callables over the graph built above.
        train = U.function(
            inputs=obs_ph_n + act_ph_n + [target_ph],
            outputs=loss,
            updates=[optimize_expr],
        )
        q_values = U.function(obs_ph_n + act_ph_n, q)

        # Target network: same input, separate "target_q_func" variables.
        target_q = q_func(
            q_input, 1, scope="target_q_func", num_units=num_units
        )[:, 0]
        target_q_func_vars = U.scope_vars(
            U.absolute_scope_name("target_q_func")
        )
        update_target_q = make_update_exp(q_func_vars, target_q_func_vars)

        target_q_values = U.function(obs_ph_n + act_ph_n, target_q)

        debug = {'q_values': q_values, 'target_q_values': target_q_values}
        return train, update_target_q, debug
def p_train(make_obs_ph_n, act_space_n, p_index, p_func, q_func, optimizer, grad_norm_clipping=None,
            local_q_func=False, num_units=64, scope="trainer", reuse=None):
    """Build the actor (policy) graph and its callables for one agent.

    Args:
        make_obs_ph_n: Observation placeholders; used directly as a list.
        act_space_n: Action spaces, one per agent; each is handed to
            ``make_pdtype``.
        p_index: Index of the agent whose policy is being built.
        p_func: Policy network builder, called as
            ``p_func(obs, n_params, scope=..., num_units=...)``.
        q_func: Critic network builder; called with ``reuse=True`` under the
            ``"q_func"`` scope, so the critic variables must already exist.
        optimizer: Forwarded to ``U.minimize_and_clip``.
        grad_norm_clipping: Forwarded to ``U.minimize_and_clip``.
        local_q_func: When true the critic input is only this agent's
            observation and sampled action; otherwise every agent's
            observation plus actions (this agent's action replaced by a
            sample from its policy).
        num_units: Forwarded to ``p_func`` and ``q_func``.
        scope: Variable scope everything is built under.
        reuse: ``reuse`` flag for that variable scope.

    Returns:
        A tuple ``(act, train, update_target_p, debug)`` where ``debug`` is a
        dict with the callables ``'p_values'`` and ``'target_act'``.
    """
    with tf.variable_scope(scope, reuse=reuse):
        # Action distribution types, one per agent.
        act_pdtype_n = [make_pdtype(act_space) for act_space in act_space_n]

        # Placeholders: observations come from the caller, actions are
        # created here.
        obs_ph_n = make_obs_ph_n
        act_ph_n = [
            pdtype.sample_placeholder([None], name="action" + str(i))
            for i, pdtype in enumerate(act_pdtype_n)
        ]

        own_pdtype = act_pdtype_n[p_index]
        n_params = int(own_pdtype.param_shape()[0])

        p_input = obs_ph_n[p_index]
        p = p_func(p_input, n_params, scope="p_func", num_units=num_units)
        p_func_vars = U.scope_vars(U.absolute_scope_name("p_func"))

        # Wrap the raw network output in a distribution object.
        act_pd = own_pdtype.pdfromflat(p)

        act_sample = act_pd.sample()
        p_reg = tf.reduce_mean(tf.square(act_pd.flatparam()))

        # Copy the action placeholders and substitute this agent's entry
        # with a fresh sample from its policy so the critic value depends on
        # the policy parameters.
        act_input_n = list(act_ph_n)
        act_input_n[p_index] = act_pd.sample()

        # Always bind q_input: previously only the local branch assigned it,
        # leaving the default (joint) path without a critic input.
        if local_q_func:
            q_input = tf.concat(
                [obs_ph_n[p_index], act_input_n[p_index]], 1
            )
        else:
            q_input = tf.concat(obs_ph_n + act_input_n, 1)
        q = q_func(
            q_input, 1, scope="q_func", reuse=True, num_units=num_units
        )[:, 0]

        # Maximise Q (minimise -Q) plus a small penalty on the policy's
        # flat parameters.
        pg_loss = -tf.reduce_mean(q)
        loss = pg_loss + p_reg * 1e-3

        optimize_expr = U.minimize_and_clip(
            optimizer, loss, p_func_vars, grad_norm_clipping
        )

        # Callables over the graph built above.
        train = U.function(
            inputs=obs_ph_n + act_ph_n, outputs=loss, updates=[optimize_expr]
        )
        act = U.function(inputs=[obs_ph_n[p_index]], outputs=act_sample)
        p_values = U.function([obs_ph_n[p_index]], p)

        # Target network: same input, separate "target_p_func" variables.
        target_p = p_func(
            p_input, n_params, scope="target_p_func", num_units=num_units
        )
        target_p_func_vars = U.scope_vars(
            U.absolute_scope_name("target_p_func")
        )
        update_target_p = make_update_exp(p_func_vars, target_p_func_vars)

        target_act_sample = own_pdtype.pdfromflat(target_p).sample()
        target_act = U.function(
            inputs=[obs_ph_n[p_index]], outputs=target_act_sample
        )

        return act, train, update_target_p, {
            'p_values': p_values,
            'target_act': target_act,
        }
def q_train(make_obs_ph_n, act_space_n, q_index, q_func, lstm_model, optimizer, args,
            grad_norm_clipping=None, local_q_func=False, scope="trainer", reuse=None,
            num_units=64, use_lstm=True, session=None, lstm_scope=None):
    """Build the critic graph for one agent, optionally behind an LSTM encoder.

    Args:
        make_obs_ph_n: Observation placeholders; used directly as a list.
        act_space_n: Action spaces, one per agent; each is handed to
            ``make_pdtype``.
        q_index: Index of the agent whose observation/action are used when
            ``local_q_func`` is true.
        q_func: Critic network builder, called as
            ``q_func(inputs, 1, scope=..., num_units=...)``.
        lstm_model: Encoder builder, called as
            ``lstm_model(obs_ph_n, scope="lstm", reuse=...)``; its result is
            indexed and concatenated like a list of per-agent tensors.
        optimizer: Forwarded to ``U.minimize_and_clip``.
        args: Object exposing ``shared_lstm``; only read when ``use_lstm``.
        grad_norm_clipping: Forwarded to ``U.minimize_and_clip``.
        local_q_func: When true the critic input is only agent ``q_index``'s
            encoded observation and action; otherwise every agent's.
        scope: Variable scope for placeholders, critic and (non-shared) LSTM.
        reuse: ``reuse`` flag for that variable scope.
        num_units: Forwarded to ``q_func``.
        use_lstm: When false each observation is only squeezed on axis 1.
        session: Forwarded to ``U.function`` and ``make_update_exp``.
        lstm_scope: Variable scope of the shared LSTM; used only when
            ``args.shared_lstm`` is true.

    Returns:
        A tuple ``(train, update_target_q, debug)`` where ``debug`` is a dict
        with the callables ``'q_values'`` and ``'target_q_values'``.
    """
    with tf.variable_scope(scope, reuse=reuse):
        # ---- placeholders --------------------------------------------
        # Action distribution types and one action placeholder per agent.
        act_pdtype_n = [make_pdtype(act_space) for act_space in act_space_n]
        act_ph_n = [
            pdtype.sample_placeholder([None], name="action" + str(i))
            for i, pdtype in enumerate(act_pdtype_n)
        ]
        # Observation placeholders are supplied by the caller.
        obs_ph_n = make_obs_ph_n
        # Target Q: computed at run time and fed in; only the loss uses it.
        target_ph = tf.placeholder(tf.float32, [None], name="target")

    # ---- observation encoding (dimension reduction) -------------------
    # The LSTM variables are collected in the same scope the LSTM is built
    # in.  Looking them up under `scope` afterwards misses a shared LSTM
    # that lives under `lstm_scope`.
    lstm_func_vars = []
    if use_lstm:
        if args.shared_lstm:
            with tf.variable_scope(lstm_scope):
                observation_n = lstm_model(obs_ph_n, scope="lstm", reuse=True)
                lstm_func_vars = U.scope_vars(U.absolute_scope_name("lstm"))
        else:
            with tf.variable_scope(scope, reuse=reuse):
                observation_n = lstm_model(
                    obs_ph_n, scope="lstm", reuse=reuse
                )
                lstm_func_vars = U.scope_vars(U.absolute_scope_name("lstm"))
    else:
        # No encoder: just drop axis 1 of every observation placeholder.
        observation_n = [tf.squeeze(o, 1) for o in obs_ph_n]

    with tf.variable_scope(scope, reuse=reuse):
        # ---- Q network -----------------------------------------------
        if local_q_func:
            q_input = tf.concat(
                [observation_n[q_index], act_ph_n[q_index]], 1
            )
        else:
            q_input = tf.concat(observation_n + act_ph_n, 1)
        q = q_func(q_input, 1, scope="q_func", num_units=num_units)[:, 0]
        q_func_vars = U.scope_vars(U.absolute_scope_name("q_func"))

        # Mean squared error against the fed target.  The original also
        # carried a disabled 1e-3 * mean(q^2) regulariser; it is not part
        # of the loss.
        loss = tf.reduce_mean(tf.square(q - target_ph))

        # Critic variables plus, when an encoder is used, its variables.
        optimize_expr = U.minimize_and_clip(
            optimizer, loss, q_func_vars + lstm_func_vars, grad_norm_clipping
        )

        # ---- callables -----------------------------------------------
        train = U.function(
            inputs=obs_ph_n + act_ph_n + [target_ph],
            outputs=loss,
            updates=[optimize_expr],
            session=session,
        )
        q_values = U.function(obs_ph_n + act_ph_n, q, session=session)

        # ---- target Q network ----------------------------------------
        target_q = q_func(
            q_input, 1, scope="target_q_func", num_units=num_units
        )[:, 0]
        target_q_func_vars = U.scope_vars(
            U.absolute_scope_name("target_q_func")
        )

        update_target_q = make_update_exp(
            q_func_vars, target_q_func_vars, session=session
        )
        target_q_values = U.function(
            obs_ph_n + act_ph_n, target_q, session=session
        )

        return train, update_target_q, {
            'q_values': q_values,
            'target_q_values': target_q_values,
        }
def p_train(make_obs_ph_n, act_space_n, p_scope, p_index, p_func, q_func, lstm_model, optimizer, args,
            grad_norm_clipping=None, local_q_func=False, num_units=64, scope="trainer",
            reuse=None, use_lstm=True, session=None):
    """Build the actor training graph for one agent, optionally behind an LSTM.

    The policy network and its target variables live under ``p_scope`` and
    are reused (``reuse=True``), so they must already exist; the critic is
    reused from ``scope``.

    Args:
        make_obs_ph_n: Observation placeholders; used directly as a list.
            The original comments describe each as
            ``[batch_size, state_dim, time_step]`` (not verifiable here).
        act_space_n: Action spaces, one per agent; each is handed to
            ``make_pdtype``.
        p_scope: Variable scope holding ``"p_func"``, ``"target_p_func"``
            and, when ``args.shared_lstm`` is true, the shared ``"lstm"``.
        p_index: Index of the agent whose policy is trained.
        p_func: Policy network builder.
        q_func: Critic network builder; called with ``reuse=True``.
        lstm_model: Encoder builder, called as
            ``lstm_model(obs_ph_n, reuse=..., scope="lstm")``.
        optimizer: Forwarded to ``U.minimize_and_clip``.
        args: Object exposing ``shared_lstm``; only read when ``use_lstm``.
        grad_norm_clipping: Forwarded to ``U.minimize_and_clip``.
        local_q_func: When true the critic input is only this agent's
            encoded observation and sampled action.
        num_units: Forwarded to ``p_func`` and ``q_func``.
        scope: Variable scope of placeholders, critic and non-shared LSTM.
        reuse: ``reuse`` flag used for ``scope`` and ``p_scope``.
        use_lstm: When false each observation is only squeezed on axis 1.
        session: Forwarded to ``U.function`` and ``make_update_exp``.

    Returns:
        A tuple ``(train, update_target_p)``.
    """
    with tf.variable_scope(scope, reuse=reuse):
        # ---- placeholders --------------------------------------------
        # Action distribution types (used for sampling) and one action
        # placeholder per agent.
        act_pdtype_n = [make_pdtype(act_space) for act_space in act_space_n]
        act_ph_n = [
            pdtype.sample_placeholder([None], name="action" + str(i))
            for i, pdtype in enumerate(act_pdtype_n)
        ]
        # Observation placeholders are supplied by the caller.
        obs_ph_n = make_obs_ph_n

    # ---- observation encoding -----------------------------------------
    if use_lstm:
        if args.shared_lstm:
            with tf.variable_scope(p_scope):
                observation_n = lstm_model(obs_ph_n, reuse=True, scope="lstm")
                lstm_vars = U.scope_vars(U.absolute_scope_name("lstm"))
        else:
            with tf.variable_scope(scope):
                observation_n = lstm_model(
                    obs_ph_n, reuse=reuse, scope="lstm"
                )
                lstm_vars = U.scope_vars(U.absolute_scope_name("lstm"))
    else:
        with tf.variable_scope(scope, reuse=reuse):
            # Observations of all agents with axis 1 dropped.
            observation_n = [tf.squeeze(o, 1) for o in obs_ph_n]

    # Local (encoded) observation of the agent being trained.
    p_input = observation_n[p_index]

    # p is shared among several actors; each critic has its own q.
    with tf.variable_scope(p_scope, reuse=reuse):
        # Policy output, later turned into an action distribution.
        p = p_func(
            p_input,
            int(act_pdtype_n[p_index].param_shape()[0]),
            scope="p_func",
            reuse=True,
            num_units=num_units,
        )
        p_func_vars = U.scope_vars(U.absolute_scope_name("p_func"))

        # Wrap the raw network output in a distribution object.
        act_pd = act_pdtype_n[p_index].pdfromflat(p)
        p_reg = tf.reduce_mean(tf.square(act_pd.flatparam()))

        # Copy the action placeholders and substitute this agent's entry
        # with a sample from its policy.
        act_input_n = list(act_ph_n)
        act_input_n[p_index] = act_pd.sample()

        # Target policy variables are only looked up here, not created.
        target_p_func_vars = U.scope_vars(
            U.absolute_scope_name("target_p_func")
        )
        update_target_p = make_update_exp(
            p_func_vars, target_p_func_vars, session=session
        )

    with tf.variable_scope(scope, reuse=tf.AUTO_REUSE):
        # ---- critic on top of the policy's action ----------------------
        if local_q_func:
            # Use the encoded observation, exactly as the critic builder
            # does; the raw placeholder would not match the input the
            # reused q_func variables were created for.
            q_input = tf.concat(
                [observation_n[p_index], act_input_n[p_index]], 1
            )
        else:
            # Encoded observations and actions of all agents.
            q_input = tf.concat(observation_n + act_input_n, 1)
        q = q_func(
            q_input, 1, scope="q_func", reuse=True, num_units=num_units
        )[:, 0]

        # Maximise Q (minimise -Q) plus a small penalty on the policy's
        # flat parameters.
        pg_loss = -tf.reduce_mean(q)
        loss = pg_loss + p_reg * 1e-3

        # The loss differs per critic, so each call builds its own
        # optimisation op; the encoder is trained along with the policy
        # when one is used.
        if use_lstm:
            train_vars = p_func_vars + lstm_vars
        else:
            train_vars = p_func_vars
        optimize_expr = U.minimize_and_clip(
            optimizer, loss, train_vars, grad_norm_clipping
        )

        # Training callable; its inputs must be given as a list.
        train = U.function(
            inputs=obs_ph_n + act_ph_n,
            outputs=loss,
            updates=[optimize_expr],
            session=session,
        )

    return train, update_target_p
def q_train(make_common_obs_ph, make_sep_obs_ph_n, act_space_n, optimizer, args, q_index, q_func,
            lstm_model, cnn_model, lstm_scope=None, cnn_scope=None, use_lstm=True, use_cnn=True,
            reuse=None, session=None, scope="trainer", local_q_func=False, num_units=64,
            grad_norm_clipping=None):
    """Build the critic graph for one agent with optional CNN and LSTM encoders.

    Args:
        make_common_obs_ph: Common observation placeholder; indexed as
            ``common_obs_ph[:, i]`` for ``i`` in ``range(shape[1])``.
        make_sep_obs_ph_n: Per-agent observation placeholders; used as a
            list.
        act_space_n: Action spaces, one per agent; each is handed to
            ``make_pdtype``.
        optimizer: Forwarded to ``U.minimize_and_clip``.
        args: Not read anywhere in this function.
        q_index: Index of the agent whose observation/action are used when
            ``local_q_func`` is true.
        q_func: Critic network builder.
        lstm_model: Called as
            ``lstm_model(common, sep_obs_ph_n, reuse=..., scope="lstm")``.
        cnn_model: Called as ``cnn_model(slice, scope="cnn")`` per slice of
            the common observation along axis 1.
        lstm_scope: Variable scope for the LSTM; defaults to ``scope``.
        cnn_scope: Variable scope for the CNN; defaults to ``scope``.
        use_lstm: Whether to run the LSTM encoder.
        use_cnn: Whether to run the CNN encoder.
        reuse: ``reuse`` flag for ``scope`` (the encoder scopes use
            ``tf.AUTO_REUSE``).
        session: Forwarded to ``U.function`` and ``make_update_exp``.
        scope: Variable scope of placeholders and critic.
        local_q_func: When true the critic input is only agent ``q_index``'s
            entry of the encoded observations and its action.
        num_units: Forwarded to ``q_func``.
        grad_norm_clipping: Forwarded to ``U.minimize_and_clip``.

    Returns:
        A tuple ``(train, update_target_q, debug)`` where ``debug`` is a dict
        with the callables ``'q_values'`` and ``'target_q_values'``.
    """
    with tf.variable_scope(scope, reuse=reuse):
        # ---- placeholders --------------------------------------------
        # Action distribution types and one action placeholder per agent.
        act_pdtype_n = [make_pdtype(act_space) for act_space in act_space_n]
        act_ph_n = [
            pdtype.sample_placeholder([None], name="action" + str(i))
            for i, pdtype in enumerate(act_pdtype_n)
        ]
        # Target Q: computed at run time and fed in; only the loss uses it.
        target_ph = tf.placeholder(tf.float32, [None], name="target")
        # Observation placeholders are supplied by the caller.
        common_obs_ph = make_common_obs_ph
        sep_obs_ph_n = make_sep_obs_ph_n

    # ---- CNN encoder (variables shared through AUTO_REUSE) -------------
    if use_cnn:
        if cnn_scope is None:
            cnn_scope = scope
        length = common_obs_ph.shape[1]
        with tf.variable_scope(cnn_scope, reuse=tf.AUTO_REUSE):
            # One CNN application per slice along axis 1, stacked on a new
            # leading axis.
            new_common_obs = tf.stack([
                cnn_model(common_obs_ph[:, i], scope="cnn")
                for i in range(length)
            ])
            cnn_vars = U.scope_vars(U.absolute_scope_name("cnn"))
    else:
        new_common_obs = common_obs_ph

    # ---- LSTM encoder (variables shared through AUTO_REUSE) ------------
    if use_lstm:
        if lstm_scope is None:
            lstm_scope = scope
        with tf.variable_scope(lstm_scope, reuse=tf.AUTO_REUSE):
            observation_n = lstm_model(
                new_common_obs, sep_obs_ph_n, reuse=reuse, scope="lstm"
            )
            lstm_func_vars = U.scope_vars(U.absolute_scope_name("lstm"))
    else:
        with tf.variable_scope(scope, reuse=reuse):
            # NOTE(review): this combines `new_common_obs` with the list
            # `sep_obs_ph_n` via `+` and the result is later combined with
            # `act_ph_n` the same way; kept exactly as in the original --
            # confirm this branch is exercised and does what is intended.
            observation_n = tf.squeeze(new_common_obs + sep_obs_ph_n, 1)

    with tf.variable_scope(scope, reuse=reuse):
        # ---- Q network -----------------------------------------------
        if local_q_func:
            q_input = tf.concat(
                [observation_n[q_index], act_ph_n[q_index]], 1
            )
        else:
            q_input = tf.concat(observation_n + act_ph_n, 1)
        q = q_func(q_input, 1, scope="q_func", num_units=num_units)[:, 0]
        q_func_vars = U.scope_vars(U.absolute_scope_name("q_func"))

        # Mean squared error against the fed target.  The original also
        # carried a disabled 1e-3 * mean(q^2) regulariser; it is not part
        # of the loss.
        loss = tf.reduce_mean(tf.square(q - target_ph))

        # Optimise the critic plus every encoder that is actually enabled.
        # Previously encoder variables were included only when BOTH
        # encoders were on, so a lone CNN or LSTM was left out.
        train_vars = list(q_func_vars)
        if use_lstm:
            train_vars = train_vars + lstm_func_vars
        if use_cnn:
            train_vars = train_vars + cnn_vars
        optimize_expr = U.minimize_and_clip(
            optimizer, loss, train_vars, grad_norm_clipping
        )

        # ---- callables -----------------------------------------------
        feed_phs = [common_obs_ph] + sep_obs_ph_n + act_ph_n
        train = U.function(
            inputs=feed_phs + [target_ph],
            outputs=loss,
            updates=[optimize_expr],
            session=session,
        )
        q_values = U.function(feed_phs, q, session=session)

        # ---- target Q network ----------------------------------------
        target_q = q_func(
            q_input, 1, scope="target_q_func", num_units=num_units
        )[:, 0]
        target_q_func_vars = U.scope_vars(
            U.absolute_scope_name("target_q_func")
        )

        update_target_q = make_update_exp(
            q_func_vars, target_q_func_vars, session=session
        )
        target_q_values = U.function(feed_phs, target_q, session=session)

        return train, update_target_q, {
            'q_values': q_values,
            'target_q_values': target_q_values,
        }