def p_train(make_obs_ph_n, act_space_n, p_index, p_func, q_func, optimizer,
            grad_norm_clipping=None, local_q_func=False, num_units=64,
            scope="trainer", reuse=None):
    with tf.variable_scope(scope, reuse=reuse):  # optionally reuse variables
        # create distributions: one action-distribution type per agent's action space
        act_pdtype_n = [make_pdtype(act_space) for act_space in act_space_n]

        # set up placeholders
        obs_ph_n = make_obs_ph_n  # observations of all agents
        # one placeholder per agent for the chosen action; [None] allows any batch size
        act_ph_n = [act_pdtype_n[i].sample_placeholder([None], name="action" + str(i))
                    for i in range(len(act_space_n))]

        p_input = obs_ph_n[p_index]  # the actor only sees its own observation

        # actor network; the number of output units equals the size of the action parameters
        p = p_func(p_input, int(act_pdtype_n[p_index].param_shape()[0]),
                   scope="p_func", num_units=num_units)
        p_func_vars = U.scope_vars(U.absolute_scope_name("p_func"))  # all variables of the actor

        # wrap parameters in distribution
        act_pd = act_pdtype_n[p_index].pdfromflat(p)
        # sampling adds exploration noise to the deterministic output, giving a stochastic policy
        act_sample = act_pd.sample()
        # flatparam() is the raw actor output for all actions; penalizing its magnitude
        # acts as a small regularizer on the policy
        p_reg = tf.reduce_mean(tf.square(act_pd.flatparam()))

        act_input_n = act_ph_n + []
        # replace only this agent's action with the output of its own policy network;
        # this connects actor and critic so the policy can be optimized through Q
        act_input_n[p_index] = act_pd.sample()
        # the centralized critic sees every agent's observation and action
        q_input = tf.concat(obs_ph_n + act_input_n, 1)
        if local_q_func:
            q_input = tf.concat([obs_ph_n[p_index], act_input_n[p_index]], 1)
        # reuse=True: q_train and p_train share the same scope, so this reuses the critic
        # already built in q_train instead of creating a new one
        q = q_func(q_input, 1, scope="q_func", reuse=True, num_units=num_units)[:, 0]

        # policy objective: maximize the mean Q value, i.e. minimize -mean(Q)
        pg_loss = -tf.reduce_mean(q)
        loss = pg_loss + p_reg * 1e-3  # plus a small regularization term on the actor outputs

        # gradient update op
        optimize_expr = U.minimize_and_clip(optimizer, loss, p_func_vars, grad_norm_clipping)

        # Create callable functions that run the graph in a session
        train = U.function(inputs=obs_ph_n + act_ph_n, outputs=loss, updates=[optimize_expr])
        act = U.function(inputs=[obs_ph_n[p_index]], outputs=act_sample)  # action from own observation
        p_values = U.function([obs_ph_n[p_index]], p)  # raw actor outputs

        # target network
        target_p = p_func(p_input, int(act_pdtype_n[p_index].param_shape()[0]),
                          scope="target_p_func", num_units=num_units)
        target_p_func_vars = U.scope_vars(U.absolute_scope_name("target_p_func"))
        update_target_p = make_update_exp(p_func_vars, target_p_func_vars)

        target_act_sample = act_pdtype_n[p_index].pdfromflat(target_p).sample()
        target_act = U.function(inputs=[obs_ph_n[p_index]], outputs=target_act_sample)

        return act, train, update_target_p, {'p_values': p_values, 'target_act': target_act}
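# The soft target-network update used throughout this file (make_update_exp) is defined
# elsewhere. A minimal sketch of the usual MADDPG-style implementation, assuming Polyak
# averaging with rate 1.0 - 1e-2; the variants below that pass target_update_tau or
# central=True presumably extend this signature.
def make_update_exp_sketch(vals, target_vals, polyak=1.0 - 1e-2):
    expression = []
    # pair source and target variables by name so they line up deterministically
    for var, var_target in zip(sorted(vals, key=lambda v: v.name),
                               sorted(target_vals, key=lambda v: v.name)):
        # target <- polyak * target + (1 - polyak) * source
        expression.append(var_target.assign(polyak * var_target + (1.0 - polyak) * var))
    expression = tf.group(*expression)
    return U.function([], [], updates=[expression])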
def p_train(env, make_obs_ph_n, act_space_n, p_index, vf_func, shana, q_func, optimizer, grad_norm_clipping=None, local_q_func=False, num_units=64, scope="trainer", reuse=None): with tf.variable_scope(scope, reuse=reuse): # create distribtuions act_pdtype_n = [make_pdtype(act_space) for act_space in act_space_n] # set up placeholders obs_ph_n = make_obs_ph_n act_ph_n = [ act_pdtype_n[i].sample_placeholder([None], name="action" + str(i)) for i in range(len(act_space_n)) ] policy = shana(env_spec=env, af=15, of=22, K=2, hidden_layer_sizes=(128, 128), qf=q_func, reg=0.001) act, log_pi = policy.actions_for(observations=make_obs_ph_n[p_index], with_log_pis=True) act_input_n = act_ph_n + [] act_input_n[p_index] = act p_func_vars = policy.get_params_internal() q_input = tf.concat(obs_ph_n + act_input_n, 1) vf_input = tf.concat(obs_ph_n, 1) if local_q_func: q_input = tf.concat([obs_ph_n[p_index], act_input_n[p_index]], 1) q = q_func(q_input, 1, scope="q_func", reuse=True, num_units=num_units)[:, 0] vf = q_func(vf_input, 1, scope="vf_func", reuse=True, num_units=num_units)[:, 0] vf_func_vars = U.scope_vars(U.absolute_scope_name("vf_func")) pg_loss = tf.reduce_mean(log_pi * tf.stop_gradient(log_pi - q + vf)) p_reg = tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES, scope=policy.name) loss = pg_loss + p_reg vf_loss = 0.5 * tf.reduce_mean((vf - tf.stop_gradient(q - log_pi))**2) optimize_expr = U.minimize_and_clip(optimizer, loss, p_func_vars, grad_norm_clipping) mikoto = U.minimize_and_clip(optimizer, vf_loss, vf_func_vars, grad_norm_clipping) # Create callable functions train = U.function(inputs=obs_ph_n + act_ph_n, outputs=loss, updates=[optimize_expr]) misaka = U.function(inputs=obs_ph_n + act_ph_n, outputs=loss, updates=[mikoto]) # target network target_p = shana(env_spec=env, af=15, of=22, K=2, hidden_layer_sizes=(128, 128), qf=q_func, reg=0.001, name='target_policy') target_p_func_vars = target_p.get_params_internal() target_vf = q_func(vf_input, 1, scope="target_vf_func", num_units=num_units)[:, 0] target_vf_func_vars = U.scope_vars( U.absolute_scope_name("target_vf_func")) target_act_r, tar_log = target_p.actions_for( observations=obs_ph_n[p_index], with_log_pis=True) update_target_p = make_update_exp(p_func_vars, target_p_func_vars) upvf = make_update_exp(vf_func_vars, target_vf_func_vars) target_act = U.function(inputs=[obs_ph_n[p_index]], outputs=target_act_r) return policy.get_actions, train, misaka, update_target_p, upvf, { 'target_act': target_act }
def p_train(make_obs_ph_n, act_space_n, p_index, p_func, q_func, optimizer, num_outputs,
            grad_norm_clipping=None, local_q_func=False, num_units=64,
            scope="coma_trainer", reuse=None):
    with tf.variable_scope(scope, reuse=reuse):
        # create distributions
        act_pdtype_n = [make_pdtype(act_space) for act_space in act_space_n]

        # set up placeholders
        obs_ph_n = make_obs_ph_n
        # act_ph_n = [act_pdtype_n[i].sample_placeholder([None], name="action"+str(i)) for i in range(len(act_space_n))]
        # COMA feeds integer action indices rather than sampled action vectors
        act_ph_n = [tf.placeholder(tf.int32, [None], name="action" + str(i))
                    for i in range(len(act_space_n))]

        # the actor's input is the local observation
        p_input = obs_ph_n[p_index]
        p = p_func(p_input, int(act_pdtype_n[p_index].param_shape()[0]),
                   scope="coma_p_func", num_units=num_units)
        p_func_vars = U.scope_vars(U.absolute_scope_name("coma_p_func"))

        # wrap parameters in distribution
        act_pd = act_pdtype_n[p_index].pdfromflat(p)  # distribution over this agent's actions
        # sample() is a Gumbel-softmax draw; COMA training needs one concrete action, so take argmax
        act_sample = act_pd.sample()
        act_picked = tf.argmax(act_sample, axis=-1, output_type=tf.int32)
        p_reg = tf.reduce_mean(tf.square(act_pd.flatparam()))

        # shallow copy of the placeholder list (+ []), then replace this agent's action
        # with the action actually picked by its policy
        act_input_n = act_ph_n + []
        act_input_n[p_index] = act_picked
        # cast the integer action indices to float and give them a feature axis so they
        # can be concatenated with the observations for the centralized critic
        act_feat_n = [tf.cast(tf.expand_dims(a, -1), tf.float32) for a in act_input_n]
        q_input = tf.concat(obs_ph_n + act_feat_n, 1)
        if local_q_func:
            q_input = tf.concat([obs_ph_n[p_index], act_feat_n[p_index]], 1)
        # the critic outputs one Q value per action of this agent
        q = q_func(q_input, num_outputs, scope="coma_q_func", reuse=True, num_units=num_units)

        # counterfactual baseline: b(s) = sum_a pi(a|s) * Q(s, a)
        # (the per-sample quantity the baseline_calculation helper is used for)
        baseline = tf.reduce_sum(act_sample * q, axis=1)
        # Q of the action actually taken
        actual_picked_q = tf.reduce_sum(tf.one_hot(act_picked, num_outputs) * q, axis=1)
        # advantage of the taken action over the counterfactual baseline
        advantage = actual_picked_q - baseline
        pg_loss = -tf.reduce_mean(advantage)

        loss = pg_loss + p_reg * 1e-3

        optimize_expr = U.minimize_and_clip(optimizer, loss, p_func_vars, grad_norm_clipping)

        # Create callable functions
        train = U.function(inputs=obs_ph_n + act_ph_n, outputs=loss, updates=[optimize_expr])
        act = U.function(inputs=[obs_ph_n[p_index]], outputs=act_sample)
        p_values = U.function([obs_ph_n[p_index]], p)

        # target network
        target_p = p_func(p_input, int(act_pdtype_n[p_index].param_shape()[0]),
                          scope="coma_target_p_func", num_units=num_units)
        target_p_func_vars = U.scope_vars(U.absolute_scope_name("coma_target_p_func"))
        update_target_p = make_update_exp(p_func_vars, target_p_func_vars)

        target_act_sample = act_pdtype_n[p_index].pdfromflat(target_p).sample()
        target_act = U.function(inputs=[obs_ph_n[p_index]], outputs=target_act_sample)

        return act, train, update_target_p, {'p_values': p_values, 'target_act': target_act}
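# Tiny worked example of the counterfactual-baseline advantage built above, in plain
# numpy for clarity (illustrative only; the trainer itself constructs the same quantity
# as a TF graph).
import numpy as np

pi = np.array([0.2, 0.5, 0.3])          # policy distribution over 3 actions
q_vals = np.array([1.0, 2.0, 0.5])      # critic outputs Q(s, a) for each action
picked = 1                              # action actually taken
baseline = np.dot(pi, q_vals)           # sum_a pi(a|s) * Q(s, a) = 1.35
advantage = q_vals[picked] - baseline   # 2.0 - 1.35 = 0.65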
def c_next(make_obs_ph, act_space, c_ph, c_next_func, num_constraints, optimizer,
           grad_norm_clipping, num_units=64, reuse=False, scope="c_next"):
    with tf.variable_scope(scope, reuse=reuse):
        # set up placeholders
        act_pdtype = make_pdtype(act_space[0])
        obs_ph = make_obs_ph
        act_ph = act_pdtype.sample_placeholder([None], name="action")
        c_next_target_ph = []
        for _ in range(num_constraints):
            c_next_target_ph.append(
                tf.placeholder(tf.float32, [None, 1], name="target" + str(_)))

        c_next_input = tf.concat(obs_ph, 1)
        # one network per constraint: g_k(s) weights the action's effect on constraint k
        gs_ = []
        for _ in range(num_constraints):
            gs_.append(c_next_func(c_next_input, int((act_pdtype.param_shape()[0]) / 2),
                                   scope="c_next_func" + str(_), num_units=num_units))

        # predicted next constraint value from the current value and the action (to be verified)
        c_ = []
        for _ in range(num_constraints):
            temp = c_ph[_] + tf.multiply(gs_[_], act_ph)
            c_.append(tf.reduce_sum(temp, -1))

        c_next_vars = [U.scope_vars(U.absolute_scope_name("c_next_func" + str(_)))
                       for _ in range(num_constraints)]

        diff = [(c_[_] - c_next_target_ph[_]) for _ in range(num_constraints)]
        c_next_loss = [tf.reduce_mean(tf.square(diff[_])) for _ in range(num_constraints)]
        optimize_expr = [U.minimize_and_clip(optimizer, c_next_loss[_], c_next_vars[_],
                                             grad_norm_clipping)
                         for _ in range(num_constraints)]

        # Create callable functions (one train function per constraint)
        train = [U.function(inputs=[obs_ph] + [act_ph] + [c_ph[_]] + [c_next_target_ph[_]],
                            outputs=c_next_loss[_], updates=[optimize_expr[_]])
                 for _ in range(num_constraints)]
        c_next_values = [U.function([obs_ph] + [act_ph] + [c_ph[_]], c_[_])
                         for _ in range(num_constraints)]
        g_next_values = [U.function([obs_ph], gs_[_]) for _ in range(num_constraints)]

        return train, c_next_values, g_next_values
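# Tiny numpy illustration of the c_ prediction built above: the network output g(s) is
# combined elementwise with the action, added to the current constraint value, and summed
# over the last axis (illustrative only; shapes follow the placeholders above).
import numpy as np

c = np.array([0.5])                      # current constraint value c_ph[k] for one sample
g = np.array([0.1, -0.2])                # c_next_func output g_k(s)
a = np.array([1.0, 0.5])                 # action
c_next_pred = np.sum(c + g * a)          # = (0.5 + 0.1) + (0.5 - 0.1) = 1.0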
def p_train_adv(make_obs_ph_n, act_space_n, p_index, p_func, q_func, optimizer,
                grad_norm_clipping=None, local_q_func=False, num_units=64,
                scope="trainer", reuse=None):
    with tf.variable_scope(scope, reuse=reuse):
        # create distributions
        act_pdtype_n = [make_pdtype(act_space) for act_space in act_space_n]

        # set up placeholders
        obs_ph_n = make_obs_ph_n
        act_ph_n = [act_pdtype_n[i].sample_placeholder([None], name="action" + str(i))
                    for i in range(len(act_space_n))]

        p_input = obs_ph_n[p_index]

        p = p_func(p_input, int(act_pdtype_n[p_index].param_shape()[0]),
                   scope="p_func", num_units=num_units)
        p_func_vars = U.scope_vars(U.absolute_scope_name("p_func"))

        # wrap parameters in distribution
        act_pd = act_pdtype_n[p_index].pdfromflat(p)
        act_sample = act_pd.sample()
        p_reg = tf.reduce_mean(tf.square(act_pd.flatparam()))

        act_input_n = act_ph_n + []
        # changed: feed the sampled action of this agent into the critic
        sample = act_pd.sample()
        act_input_n[p_index] = sample
        q_input = tf.concat(obs_ph_n + act_input_n, 1)
        if local_q_func:
            q_input = tf.concat([obs_ph_n[p_index], act_input_n[p_index]], 1)
        q = q_func(q_input, 1, scope="q_func", reuse=True, num_units=num_units)[:, 0]

        ## Modifications here
        ## Build the value estimate V(s) = sum_i pi(a_i|s) * Q(s, a_i), one entry per batch row
        v = tf.tile([0.0], [tf.shape(sample)[0]])  # value-function accumulator
        for i in range(act_space_n[p_index].n):
            # create a row tensor with the i-th element set to 1 (actions are one-hot)
            a = np.zeros((1, act_space_n[p_index].n), dtype=np.float32)
            a[0, i] = 1
            a = tf.convert_to_tensor(a)
            # tile this row once per batch element
            a = tf.tile(a, [tf.shape(sample)[0], 1])
            act_input = act_ph_n + []
            act_input[p_index] = a
            q_input_tmp = tf.concat(obs_ph_n + act_input, 1)
            if local_q_func:
                q_input_tmp = tf.concat([obs_ph_n[p_index], act_input[p_index]], 1)
            # probability of taking action i under the current policy
            p_i = tf.nn.softmax(act_pd.logits)[:, i]
            # Q(s, a_i) weighted by the probability of taking action i
            tmp = tf.multiply(q_func(q_input_tmp, 1, scope="q_func", reuse=True,
                                     num_units=num_units)[:, 0], p_i)
            v = tf.add(v, tmp)
        # advantage of the sampled action over the value estimate
        advantage = tf.subtract(q, v)
        # maximize the advantage, i.e. minimize its negative mean
        pg_loss = -tf.reduce_mean(advantage)
        ## Modifications end

        loss = pg_loss + p_reg * 1e-3

        optimize_expr = U.minimize_and_clip(optimizer, loss, p_func_vars, grad_norm_clipping)

        # Create callable functions
        train = U.function(inputs=obs_ph_n + act_ph_n, outputs=loss, updates=[optimize_expr])
        act = U.function(inputs=[obs_ph_n[p_index]], outputs=act_sample)
        p_values = U.function([obs_ph_n[p_index]], p)

        # target network
        target_p = p_func(p_input, int(act_pdtype_n[p_index].param_shape()[0]),
                          scope="target_p_func", num_units=num_units)
        target_p_func_vars = U.scope_vars(U.absolute_scope_name("target_p_func"))
        update_target_p = make_update_exp(p_func_vars, target_p_func_vars)

        target_act_sample = act_pdtype_n[p_index].pdfromflat(target_p).sample()
        target_act = U.function(inputs=[obs_ph_n[p_index]], outputs=target_act_sample)

        return act, train, update_target_p, {'p_values': p_values, 'target_act': target_act}
def p_train(name, make_obs_ph_n, adj_n, act_space_n, neighbor_n, p_index, p_func, q_func, num_adversaries, optimizer, grad_norm_clipping=None, local_q_func=False, num_units=128, scope="trainer", reuse=None): with tf.variable_scope(scope, reuse=reuse): # create distribtuions act_pdtype_n = [make_pdtype(act_space) for act_space in act_space_n] # set up placeholders obs_ph_n = make_obs_ph_n act_ph_n = [ act_pdtype_n[i].sample_placeholder([None], name="action" + str(i)) for i in range(len(act_space_n)) ] agent_n = len(obs_ph_n) vec_n = U.BatchInput([1, neighbor_n], name="vec").get() p_input1 = obs_ph_n[ 0:num_adversaries] if name == "adversaries" else obs_ph_n[ num_adversaries:agent_n] p_input2 = adj_n[0:num_adversaries] if name == "adversaries" else adj_n[ num_adversaries:agent_n] p_input3 = vec_n # call for actor network # act_space is not good!!!!!!!!!! p = p_func(p_input1, p_input2, p_input3, neighbor_n, num_adversaries if name == "adversaries" else (agent_n - num_adversaries), 5, scope="p_func", num_units=num_units) p_func_vars = U.scope_vars(U.absolute_scope_name("p_func")) # wrap parameters in distribution act_pd = [] act_sample = [] for i in range(0, num_adversaries) if name == "adversaries" else range( num_adversaries, agent_n): act_pd_temp = act_pdtype_n[i].pdfromflat( p[i - (0 if name == "adversaries" else num_adversaries)]) act_pd.append(act_pd_temp) act_sample.append(act_pd_temp.sample()) temp = [] for i in range(len(act_pd)): temp.append(act_pd[i].flatparam()) # Is this regularization method correct?????????????????????????????/ p_reg = tf.reduce_mean(tf.square(temp)) act_input_n = act_ph_n + [] for i in range(0, num_adversaries) if name == "adversaries" else range( num_adversaries, agent_n): act_input_n[i] = act_sample[ i - (0 if name == "adversaries" else num_adversaries)] q_input = tf.concat(obs_ph_n + act_input_n, 1) q = [] q_reduce_mean = [] for a in range(0, num_adversaries) if name == "adversaries" else range( num_adversaries, agent_n): index = a if name == "adversaries" else a - num_adversaries temp = q_func(q_input, 1, scope="q_func_%d" % index, reuse=True, num_units=num_units)[:, 0] q.append(temp) q_reduce_mean += temp pg_loss = -tf.reduce_mean(q) loss = pg_loss + p_reg * 1e-3 optimize_expr = U.minimize_and_clip(optimizer, loss, p_func_vars, grad_norm_clipping) # Create callable functions train = U.function(inputs=obs_ph_n + act_ph_n + adj_n + [vec_n], outputs=loss, updates=[optimize_expr]) act = U.function(inputs=p_input1 + (adj_n[0:num_adversaries] if name == "adversaries" else adj_n[num_adversaries:agent_n]) + [p_input3], outputs=act_sample, list_output=True) p_values = U.function( p_input1 + (adj_n[0:num_adversaries] if name == "adversaries" else adj_n[num_adversaries:agent_n]) + [p_input3], p, list_output=True) # target network target_p = p_func(p_input1, p_input2, p_input3, neighbor_n, num_adversaries if name == "adversaries" else (agent_n - num_adversaries), 5, scope="target_p_func", num_units=num_units) target_p_func_vars = U.scope_vars( U.absolute_scope_name("target_p_func")) update_target_p = make_update_exp(p_func_vars, target_p_func_vars, central=True) target_act_sample = [] for i in range(0, num_adversaries) if name == "adversaries" else range( num_adversaries, agent_n): target_act_sample.append(act_pdtype_n[i].pdfromflat(target_p[i - ( 0 if name == "adversaries" else num_adversaries)]).sample()) target_act = U.function( inputs=p_input1 + (adj_n[0:num_adversaries] if name == "adversaries" else adj_n[num_adversaries:agent_n]) + [p_input3], 
outputs=target_act_sample, list_output=True) return act, train, update_target_p, p_values, target_act
def q_train(make_obs_ph_n, act_space_n, q_index, q_func, optimizer, grad_norm_clipping=None, local_q_func=False, num_units=64, scope="trainer", reuse=None): """ Q-Learning make_obs_ph_n (tf.placeholder): Placeholder for the observation space of all agents act_space_n (list): A list of the action spaces for all agents q_index (int): Agent index number q_func (function): MLP Neural Network model for the agent. optimizer (function): Network Optimizer function grad_norm_clipping (float): Value by which to clip the norm of the gradient local_q_func (boolean): Flag for using local q function num_units (int): The number outputs for the layers of the model scope (str): The name of the scope reuse (boolean): Flag specifying whether to reuse the scope Returns: train (function): Training function for Q network update_target_q (function): Update function for updating Q network values q_debug (dict): Contains 'q_values' and 'target_q_values' of the Q network """ with tf.variable_scope(scope, reuse=reuse): # Create distribtuions act_pdtype_n = [make_pdtype(act_space) for act_space in act_space_n] # Set up placeholders obs_ph_n = make_obs_ph_n act_ph_n = [ act_pdtype_n[i].sample_placeholder([None], name="action" + str(i)) for i in range(len(act_space_n)) ] target_ph = tf.placeholder(tf.float32, [None], name="target") q_input = tf.concat(obs_ph_n + act_ph_n, 1) if local_q_func: q_input = tf.concat([obs_ph_n[q_index], act_ph_n[q_index]], 1) q = q_func(q_input, 1, scope="q_func", num_units=num_units)[:, 0] q_func_vars = tf_util.scope_vars(tf_util.absolute_scope_name("q_func")) q_loss = tf.reduce_mean(tf.square(q - target_ph)) # Viscosity solution to Bellman differential equation in place of an initial condition q_reg = tf.reduce_mean(tf.square(q)) # loss = q_loss + 1e-3 * q_reg loss = q_loss optimize_expr = tf_util.minimize_and_clip(optimizer, loss, q_func_vars, grad_norm_clipping) # Create callable functions train = tf_util.function(inputs=obs_ph_n + act_ph_n + [target_ph], outputs=loss, updates=[optimize_expr]) q_values = tf_util.function(obs_ph_n + act_ph_n, q) # Target network target_q = q_func(q_input, 1, scope="target_q_func", num_units=num_units)[:, 0] target_q_func_vars = tf_util.scope_vars( tf_util.absolute_scope_name("target_q_func")) update_target_q = make_update_exp(q_func_vars, target_q_func_vars) target_q_values = tf_util.function(obs_ph_n + act_ph_n, target_q) return train, update_target_q, { 'q_values': q_values, 'target_q_values': target_q_values }
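# Hedged sketch of how the `target` placeholder of q_train above is typically filled by
# the caller in an MADDPG-style update. The names `agents`, `p_debug`, `obs_next_n`,
# `rew`, `done` and `gamma` are illustrative assumptions, not defined in this module.
def td_target_sketch(agents, q_debug, obs_next_n, rew, done, gamma=0.95):
    # each agent's target policy proposes its next action
    target_act_next_n = [agents[i].p_debug['target_act'](obs_next_n[i])
                         for i in range(len(agents))]
    # bootstrap with the target critic evaluated at the next joint observation-action
    target_q_next = q_debug['target_q_values'](*(obs_next_n + target_act_next_n))
    return rew + gamma * (1.0 - done) * target_q_next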
def q_train(make_obs_ph_n, act_space_n, q_index, q_func, optimizer, adversarial, adv_eps, adv_eps_s, num_adversaries, grad_norm_clipping=None, local_q_func=False, scope="trainer", reuse=None, num_units=64): with tf.variable_scope(scope, reuse=reuse): # create distribtuions act_pdtype_n = [make_pdtype(act_space) for act_space in act_space_n] # set up placeholders obs_ph_n = make_obs_ph_n act_ph_n = [ act_pdtype_n[i].sample_placeholder([None], name="action" + str(i)) for i in range(len(act_space_n)) ] target_ph = tf.placeholder(tf.float32, [None], name="target") q_input = tf.concat(obs_ph_n + act_ph_n, 1) if local_q_func: q_input = tf.concat([obs_ph_n[q_index], act_ph_n[q_index]], 1) q = q_func(q_input, 1, scope="q_func", num_units=num_units)[:, 0] q_func_vars = U.scope_vars(U.absolute_scope_name("q_func")) q_loss = tf.reduce_mean(tf.square(q - target_ph)) # viscosity solution to Bellman differential equation in place of an initial condition q_reg = tf.reduce_mean(tf.square(q)) loss = q_loss #+ 1e-3 * q_reg optimize_expr = U.minimize_and_clip(optimizer, loss, q_func_vars, grad_norm_clipping) # Create callable functions train = U.function(inputs=obs_ph_n + act_ph_n + [target_ph], outputs=loss, updates=[optimize_expr]) q_values = U.function(obs_ph_n + act_ph_n, q) # target network target_q = q_func(q_input, 1, scope="target_q_func", num_units=num_units)[:, 0] # if adversarial: # num_agents = len(act_ph_n) # if q_index < num_adversaries: # adv_rate = [adv_eps_s *(i < num_adversaries) + adv_eps * (i >= num_adversaries) for i in range(num_agents)] # else: # adv_rate = [adv_eps_s *(i >= num_adversaries) + adv_eps * (i < num_adversaries) for i in range(num_agents)] # print(" adv rate for q_index : ", q_index, adv_rate) # pg_loss = -tf.reduce_mean(target_q) # raw_perturb = tf.gradients(pg_loss, act_ph_n) # perturb = [adv_eps * tf.stop_gradient(tf.nn.l2_normalize(elem, axis = 1)) for elem in raw_perturb] # new_act_n = [perturb[i] + act_ph_n[i] if i != q_index # else act_ph_n[i] for i in range(len(act_ph_n))] # adv_q_input = tf.concat(obs_ph_n + new_act_n, 1) # target_q = q_func(adv_q_input, 1, scope ='target_q_func', reuse=True, num_units=num_units)[:,0] target_q_func_vars = U.scope_vars( U.absolute_scope_name("target_q_func")) update_target_q = make_update_exp(q_func_vars, target_q_func_vars) target_q_values = U.function(obs_ph_n + act_ph_n, target_q) return train, update_target_q, { 'q_values': q_values, 'target_q_values': target_q_values }
def p_train(make_obs_ph_n, act_space_n, p_index, p_func, q_func, optimizer, grad_norm_clipping=None, local_q_func=False, num_units=64, scope="trainer", reuse=None): #note(Daniel): We need to handle the change of the p_index somehow. hmm. We could just shuffle it I suppose? # The observations that is. I mean it's not the cleanest solution but we wouldn't have to change anything in here. # I'm gonna test that! with tf.variable_scope(scope, reuse=reuse): # create distribtuions act_pdtype_n = [make_pdtype(act_space) for act_space in act_space_n] # set up placeholders obs_ph_n = make_obs_ph_n act_ph_n = [ act_pdtype_n[i].sample_placeholder([None], name="action" + str(i)) for i in range(len(act_space_n)) ] p_input = obs_ph_n[p_index] num_actions = int(act_pdtype_n[p_index].param_shape()[0]) p = p_func(p_input, int(act_pdtype_n[p_index].param_shape()[0]), scope="p_func", num_units=num_units) p_func_vars = U.scope_vars(U.absolute_scope_name("p_func")) # wrap parameters in distribution act_pd = act_pdtype_n[p_index].pdfromflat(p) act_sample = act_pd.sample() p_reg = tf.reduce_mean(tf.square(act_pd.flatparam())) act_input_n = act_ph_n + [] act_input_n[p_index] = act_pd.sample() q_input = tf.concat(obs_ph_n + act_input_n, 1) if local_q_func: q_input = tf.concat([obs_ph_n[p_index], act_input_n[p_index]], 1) q = q_func(q_input, 1, scope="q_func", reuse=True, num_units=num_units)[:, 0] pg_loss = -tf.reduce_mean(q) loss = pg_loss + p_reg * 1e-3 optimize_expr = U.minimize_and_clip(optimizer, loss, p_func_vars, grad_norm_clipping) # Create callable functions train = U.function(inputs=obs_ph_n + act_ph_n, outputs=loss, updates=[optimize_expr]) act = U.function(inputs=[obs_ph_n[p_index]], outputs=act_sample) p_values = U.function([obs_ph_n[p_index]], p) # target network # Note(Daniel): Can we maybe skip this if we use the same network? Hmm probably should't # This is probably the same thing as target_q in dqn target_p = p_func(p_input, int(act_pdtype_n[p_index].param_shape()[0]), scope="target_p_func", num_units=num_units) target_p_func_vars = U.scope_vars( U.absolute_scope_name("target_p_func")) update_target_p = make_update_exp(p_func_vars, target_p_func_vars) target_act_sample = act_pdtype_n[p_index].pdfromflat(target_p).sample() target_act = U.function(inputs=[obs_ph_n[p_index]], outputs=target_act_sample) return act, train, update_target_p, { 'p_values': p_values, 'target_act': target_act }, num_actions
def p_train(make_obs_ph_n, act_space_n, p_index, p_func, q_func, shared_CNN, optimizer, make_obs_map_ph_n, grad_norm_clipping=None, local_q_func=False, num_units=64, scope="trainer", reuse=None): act_pdtype_n = [make_pdtype(act_space) for act_space in act_space_n] # set up placeholders obs_ph_n = make_obs_ph_n act_ph_n = [ act_pdtype_n[i].sample_placeholder([None], name="action" + str(i)) for i in range(len(act_space_n)) ] obs_map_ph_n = make_obs_map_ph_n p_input = obs_ph_n[p_index] p_map_input = obs_map_ph_n[p_index] with tf.variable_scope(scope, reuse=None): # create distribtuions map_context_input = shared_CNN(p_map_input, p_index, scope="CNN") CNN_vars = U.scope_vars(U.absolute_scope_name("CNN")) # num_adversary=2 # if p_index<num_adversary: # map_context_input=shared_CNN(p_map_input,scope="CNN-adv") # CNN_vars=U.scope_vars("CNN-adv") # else: # map_context_input=shared_CNN(p_map_input,scope="CNN-age") # CNN_vars=U.scope_vars("CNN-age") p = p_func(tf.concat([p_input, map_context_input], 1), int(act_pdtype_n[p_index].param_shape()[0]), scope="p_func", num_units=num_units) p_func_vars = U.scope_vars(U.absolute_scope_name("p_func")) # wrap parameters in distribution act_pd = act_pdtype_n[p_index].pdfromflat(p) act_sample = act_pd.sample() p_reg = tf.reduce_mean(tf.square(act_pd.flatparam())) act_input_n = act_ph_n + [] act_input_n[p_index] = act_pd.sample() #act_pd.mode() # q_input = tf.concat(obs_ph_n + act_input_n, 1) for i in range(len(obs_ph_n)): q_input = tf.concat([ q_input, shared_CNN(obs_map_ph_n[i], i, scope="agent_" + str(i) + "/CNN") ], 1) # for i in range(len(obs_ph_n)): # if i<num_adversary: # q_input=tf.concat([q_input,shared_CNN(obs_map_ph_n[i],scope="CNN-adv")],1) # else: # q_input=tf.concat([q_input,shared_CNN(obs_map_ph_n[i],scope="CNN-age")],1) # for i in range(len(obs_ph_n)): # q_input=tf.concat([q_input,shared_CNN(obs_map_ph_n[i],i,scope="agent_"+str(i)+"/CNN")],1) # with tf.variable_scope(scope, reuse=None): with tf.variable_scope(scope, reuse=None): if local_q_func: q_input = tf.concat([obs_ph_n[p_index], act_input_n[p_index]], 1) q = q_func(q_input, 1, scope="q_func", reuse=True, num_units=num_units)[:, 0] pg_loss = -tf.reduce_mean(q) loss = pg_loss + p_reg * 1e-3 optimize_expr = U.minimize_and_clip(optimizer, loss, p_func_vars, grad_norm_clipping) with tf.variable_scope(scope, reuse=True): optimize_expr2 = U.minimize_and_clip(optimizer, loss, CNN_vars, grad_norm_clipping) # Create callable functions with tf.variable_scope(scope, reuse=None): train = U.function(inputs=obs_ph_n + act_ph_n + obs_map_ph_n, outputs=loss, updates=[optimize_expr, optimize_expr2]) act = U.function(inputs=[obs_ph_n[p_index], obs_map_ph_n[p_index]], outputs=act_sample) p_values = U.function([obs_ph_n[p_index], obs_map_ph_n[p_index]], p) #p_values = U.function([obs_ph_n[p_index]], p) # target network target_p = p_func(tf.concat([p_input, map_context_input], 1), int(act_pdtype_n[p_index].param_shape()[0]), scope="target_p_func", num_units=num_units) target_p_func_vars = U.scope_vars( U.absolute_scope_name("target_p_func")) update_target_p = make_update_exp(p_func_vars, target_p_func_vars) target_act_sample = act_pdtype_n[p_index].pdfromflat(target_p).sample() target_act = U.function( inputs=[obs_ph_n[p_index], obs_map_ph_n[p_index]], outputs=target_act_sample) return act, train, update_target_p, { 'p_values': p_values, 'target_act': target_act }
def p_train(make_obs_ph_n, act_space_n, before_com_func, channel, after_com_func, q_func, optimizer, grad_norm_clipping=None, local_q_func=False, num_units=64, scope="trainer", reuse=None, beta=0.01, ibmac_com=True): with tf.variable_scope(scope, reuse=reuse): clip_threshold = 1 # 1, 5, 10 is_norm_training = tf.placeholder(tf.bool) is_inference = tf.placeholder(tf.bool) ibmac_nocom = not ibmac_com num_agents = len(make_obs_ph_n) # create distribtuions act_pdtype_n = [make_pdtype(act_space) for act_space in act_space_n] # set up placeholders obs_ph_n = make_obs_ph_n act_ph_n = [ act_pdtype_n[i].sample_placeholder([None], name="action" + str(i)) for i in range(num_agents) ] hiddens_n = [ before_com_func(obs_ph_n[i], num_units, scope="before_com_{}".format(i), num_units=num_units) for i in range(num_agents) ] before_com_vars_n = [ U.scope_vars(U.absolute_scope_name("before_com_{}".format(i))) for i in range(num_agents) ] hiddens_n_for_message = tf.concat([ before_com_func(obs_ph_n[i], num_units, scope="before_com_{}".format(i), reuse=True, num_units=num_units) for i in range(num_agents) ], axis=1) hiddens_n_for_message = tf.stop_gradient(hiddens_n_for_message) channel_output = channel(hiddens_n_for_message, num_units * num_agents, scope="channel", num_units=num_units * num_agents) message_n, mu_message_n, logvar_message_n = [ tf.split(item, num_or_size_splits=num_agents, axis=1) for item in channel_output ] logvar_message_n = [ tf.clip_by_value(log, -10, 10) for log in logvar_message_n ] # constrain kl_loss not to be too large message_n = [ clip_message(message, clip_threshold, is_norm_training, is_inference) for message in message_n ] channel_vars_n = [U.scope_vars(U.absolute_scope_name("channel"))] if ibmac_nocom: print('no_com') p_n = [ after_com_func(hiddens_n[i], int(act_pdtype_n[i].param_shape()[0]), scope="p_func_{}".format(i), num_units=num_units) for i in range(num_agents) ] else: check_n = [hiddens_n[i] + message_n[i] for i in range(num_agents)] p_n = [ after_com_func(hiddens_n[i] + message_n[i], int(act_pdtype_n[i].param_shape()[0]), scope="p_func_{}".format(i), num_units=num_units) for i in range(num_agents) ] p_func_vars = [ U.scope_vars(U.absolute_scope_name("p_func_{}".format(i))) for i in range(num_agents) ] # wrap parameters in distribution act_pd_n = [ act_pdtype_n[i].pdfromflat(p_n[i]) for i in range(num_agents) ] act_sample_n = [act_pd.sample() for act_pd in act_pd_n] p_reg_n = [ tf.reduce_mean(tf.square(act_pd.flatparam())) for act_pd in act_pd_n ] act_input_n_n = [act_ph_n + [] for _ in range(num_agents)] for i in range(num_agents): act_input_n_n[i][i] = act_pd_n[i].sample() q_input_n = [ tf.concat(obs_ph_n + act_input_n, 1) for act_input_n in act_input_n_n ] q_n = [ q_func(q_input_n[i], 1, scope="q_func_{}".format(i), reuse=True, num_units=num_units)[:, 0] for i in range(num_agents) ] pg_loss_n = [-tf.reduce_mean(q) for q in q_n] # # 0.25 # kl_loss_message_n = [2 * (tf.pow(mu, 2) + tf.pow(tf.exp(log), 2)) - log + np.log(0.5) - 0.5 for mu, log in # zip(mu_message_n, logvar_message_n)] # #1 # kl_loss_message_n = [0.5 * (tf.pow(mu, 2) + tf.pow(tf.exp(log), 2)) - log - 0.5 for mu, log in # zip(mu_message_n, logvar_message_n)] # #5 # kl_loss_message_n = [1.0/50 * (tf.pow(mu, 2) + tf.pow(tf.exp(log), 2)) - log + np.log(5) - 0.5 for mu, log in # zip(mu_message_n, logvar_message_n)] #10 kl_loss_message_n = [ 1.0 / 200 * (tf.pow(mu, 2) + tf.pow(tf.exp(log), 2)) - log + np.log(10) - 0.5 for mu, log in zip(mu_message_n, logvar_message_n) ] entropy = [tf.exp(log) + 1.4189 for log 
in logvar_message_n] pg_loss = tf.reduce_sum(pg_loss_n) p_reg = tf.reduce_sum(p_reg_n) kl_loss_message = tf.reduce_mean(kl_loss_message_n) if ibmac_nocom: loss = pg_loss + p_reg * 1e-3 else: loss = pg_loss + p_reg * 1e-3 + beta * kl_loss_message kl_loss = U.function(inputs=obs_ph_n + act_ph_n + [is_norm_training, is_inference], outputs=kl_loss_message) var_list = [] var_list.extend(before_com_vars_n) if not ibmac_nocom: var_list.extend(channel_vars_n) var_list.extend(p_func_vars) var_list = list(itertools.chain(*var_list)) optimize_expr = U.minimize_and_clip(optimizer, loss, var_list, grad_norm_clipping) # Create callable functions train = U.function(inputs=obs_ph_n + act_ph_n + [is_norm_training, is_inference], outputs=loss, updates=[optimize_expr]) act = U.function(inputs=obs_ph_n + [is_norm_training, is_inference], outputs=act_sample_n) p_values = U.function(inputs=obs_ph_n + [is_norm_training, is_inference], outputs=p_n) if not ibmac_nocom: check_values = U.function(inputs=obs_ph_n + [is_norm_training, is_inference], outputs=check_n) channel_com = U.function(inputs=obs_ph_n + [is_norm_training, is_inference], outputs=channel_output) check_mu = U.function(inputs=obs_ph_n + [is_norm_training, is_inference], outputs=mu_message_n) check_log = U.function(inputs=obs_ph_n + [is_norm_training, is_inference], outputs=logvar_message_n) else: check_values = lambda x: 0 channel_com = lambda x: 0 check_mu = lambda x: 0 check_log = lambda x: 0 # target network target_hiddens_n = [ before_com_func(obs_ph_n[i], num_units, scope="target_before_com_{}".format(i), num_units=num_units) for i in range(num_agents) ] target_before_com_vars = [ U.scope_vars( U.absolute_scope_name("target_before_com_{}".format(i))) for i in range(num_agents) ] target_hiddens_n_for_message = tf.concat([ before_com_func(obs_ph_n[i], num_units, scope="target_before_com_{}".format(i), reuse=True, num_units=num_units) for i in range(num_agents) ], axis=1) target_hiddens_n_for_message = tf.stop_gradient( target_hiddens_n_for_message) target_channel_output = channel(target_hiddens_n_for_message, num_units * num_agents, scope="target_channel", num_units=num_units * num_agents) target_message_n, target_mu_message_n, target_logvar_message_n = [ tf.split(item, num_or_size_splits=num_agents, axis=1) for item in target_channel_output ] target_channel_vars = [ U.scope_vars(U.absolute_scope_name("target_channel")) ] if ibmac_nocom: target_p_n = [ after_com_func(target_hiddens_n[i], int(act_pdtype_n[i].param_shape()[0]), scope="target_p_func_{}".format(i), num_units=num_units) for i in range(num_agents) ] else: target_p_n = [ after_com_func(target_hiddens_n[i] + target_message_n[i], int(act_pdtype_n[i].param_shape()[0]), scope="target_p_func_{}".format(i), num_units=num_units) for i in range(num_agents) ] # target_p_n = [after_com_func(tf.concat([target_hiddens_n[i],target_message_n[i]], axis=1), int(act_pdtype_n[i].param_shape()[0]), scope="target_p_func_{}".format(i), num_units=num_units) for i in range(num_agents)] target_p_func_vars = [ U.scope_vars(U.absolute_scope_name("target_p_func_{}".format(i))) for i in range(num_agents) ] target_var_list = [] target_var_list.extend(target_before_com_vars) if not ibmac_nocom: target_var_list.extend(target_channel_vars) target_var_list.extend(target_p_func_vars) target_var_list = list(itertools.chain(*target_var_list)) update_target_p = make_update_exp(var_list, target_var_list) target_act_sample_n = [ act_pdtype_n[i].pdfromflat(target_p_n[i]).sample() for i in range(num_agents) ] target_act = 
U.function(inputs=obs_ph_n + [is_norm_training, is_inference], outputs=target_act_sample_n) check_message_n = U.function(inputs=obs_ph_n + [is_norm_training, is_inference], outputs=message_n) check_hiddens_n = U.function(inputs=obs_ph_n + [is_norm_training, is_inference], outputs=hiddens_n) check_entropy = U.function(inputs=obs_ph_n + [is_norm_training, is_inference], outputs=entropy) return act, train, update_target_p, { 'p_values': p_values, 'target_act': target_act, 'kl_loss': kl_loss, 'check_values': check_values, 'channel_com': channel_com, 'check_mu': check_mu, 'check_log': check_log, 'check_message_n': check_message_n, 'check_hiddens_n': check_hiddens_n, 'check_entropy': check_entropy }
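# The `channel` network used by the communication trainer above returns a sampled message
# together with its mean and log-variance, which suggests a reparameterized Gaussian
# bottleneck. A minimal sketch of such a channel, assuming a single hidden layer; the
# real implementation may differ.
def channel_sketch(hidden, num_outputs, scope, num_units=64, reuse=None):
    with tf.variable_scope(scope, reuse=reuse):
        h = tf.layers.dense(hidden, num_units, activation=tf.nn.relu)
        mu = tf.layers.dense(h, num_outputs)
        logvar = tf.layers.dense(h, num_outputs)
        eps = tf.random_normal(tf.shape(mu))
        message = mu + tf.exp(0.5 * logvar) * eps  # reparameterization trick
        return message, mu, logvar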
def p_train(n_agents, make_state_ph_n, make_obs_ph_n, act_space_n, p_index, p_func, q_func, optimizer, grad_norm_clipping=None, local_q_func=False, num_units=64, scope="trainer", reuse=None, discrete_action=False, target_update_tau=0.001, use_global_state=False, share_weights=False): with tf.variable_scope(scope, reuse=reuse): # create distribtuions act_pdtype_n = [make_pdtype(act_space) for act_space in act_space_n] act_test_pdtype_n = [make_pdtype(act_space) for act_space in act_space_n] # set up placeholders obs_ph_n = make_obs_ph_n state_ph_n = make_state_ph_n act_ph_n = [act_pdtype_n[i].sample_placeholder([None], name="action"+str(i)) for i in range(len(act_space_n))] p_input = obs_ph_n[p_index] if share_weights: # add agent id to input as layers share weights p_input = tf.concat([p_input, tf.tile(tf.eye(n_agents)[p_index:p_index+1], [tf.shape(p_input)[0], 1])], -1) print("ACTPDTYPE: {}".format(act_space_n)) print("PINDEX: {}".format(p_index)) p = p_func(p_input, int(act_pdtype_n[p_index].param_shape()[0]), scope="p_func", reuse=share_weights, num_units=num_units, constrain_out=True, discrete_action=discrete_action) p_func_vars = U.scope_vars(U.absolute_scope_name("p_func")) # wrap parameters in distribution act_pd = act_pdtype_n[p_index].pdfromflat(p) act_test_pd = act_test_pdtype_n[p_index].pdfromflat(p, test=True) # NOTE: test=True during testing time act_sample = act_pd.sample() act_test_sample = act_test_pd.sample() p_reg = tf.reduce_mean(tf.square(act_pd.flatparam())) act_input_n = act_ph_n + [] act_input_n[p_index] = act_pd.sample() if not use_global_state: q_input = tf.concat(obs_ph_n + act_input_n, 1) if local_q_func: q_input = tf.concat([obs_ph_n[p_index], act_input_n[p_index]], 1) else: q_input = tf.concat(state_ph_n + act_input_n, 1) if local_q_func: q_input = tf.concat([state_ph_n[p_index], act_input_n[p_index]], 1) if share_weights: # add agent id to input as layers share weights q_input = tf.concat([q_input, tf.tile(tf.eye(n_agents)[p_index:p_index+1], [tf.shape(q_input)[0], 1])], -1) q = q_func(q_input, 1, scope="q_func", reuse=share_weights, num_units=num_units, constrain_out=False, discrete_action=discrete_action)[:, 0] pg_loss = -tf.reduce_mean(q) loss = pg_loss + p_reg * 1e-3 optimize_expr = U.minimize_and_clip(optimizer, loss, p_func_vars, grad_norm_clipping) # Create callable functions if not use_global_state: train = U.function(inputs=obs_ph_n + act_ph_n, outputs=loss, updates=[optimize_expr]) else: train = U.function(inputs=state_ph_n + obs_ph_n + act_ph_n, outputs=loss, updates=[optimize_expr]) act = U.function(inputs=[obs_ph_n[p_index]], outputs=act_sample) act_test = U.function(inputs=[obs_ph_n[p_index]], outputs=act_test_sample) p_values = U.function([obs_ph_n[p_index]], p) # target network target_p = p_func(p_input, int(act_pdtype_n[p_index].param_shape()[0]), scope="target_p_func", reuse=share_weights, num_units=num_units, constrain_out=True, discrete_action=discrete_action) target_p_func_vars = U.scope_vars(U.absolute_scope_name("target_p_func")) update_target_p = make_update_exp(p_func_vars, target_p_func_vars, target_update_tau) target_act_sample = act_pdtype_n[p_index].pdfromflat(target_p).sample() target_act = U.function(inputs=[obs_ph_n[p_index]], outputs=target_act_sample) return act, act_test, train, update_target_p, {'p_values': p_values, 'target_act': target_act}
def _p_train(n_agents, make_state_ph_n, make_obs_ph_n, act_space_n, p_index, p_func, q_func, optimizer, q_lstm_on, p_lstm_on, grad_norm_clipping=None, local_q_func=False, num_units=64, scope="trainer", reuse=None, q_debug=None, discrete_action=False, target_update_tau=0.001, use_global_state=False, share_weights=False): with tf.variable_scope(scope, reuse=reuse): # create distribtuions act_pdtype_n = [make_pdtype(act_space) for act_space in act_space_n] act_test_pdtype_n = [make_pdtype(act_space) for act_space in act_space_n] # set up placeholders obs_ph_n = make_obs_ph_n state_ph_n = make_state_ph_n act_ph_n = [act_pdtype_n[i].sample_placeholder([None, 1], name="action" + str(i)) for i in range(len(act_space_n))] q_res = 1 p_res = int(act_pdtype_n[p_index].param_shape()[0]) # for actor p_c_ph, p_h_ph = get_lstm_state_ph(name='p_', n_batches=None, num_units=num_units) p_c_ph_n, p_h_ph_n = [p_c_ph for i in range(len(obs_ph_n))], [p_h_ph for i in range(len(obs_ph_n))] # for critic q_c_ph, q_h_ph = get_lstm_state_ph(name='q_', n_batches=None, num_units=num_units) q_c_ph_n, q_h_ph_n = [q_c_ph for i in range(len(obs_ph_n))], [q_h_ph for i in range(len(obs_ph_n))] if p_lstm_on: if not use_global_state: p_input = tf.concat([obs_ph_n[p_index], p_c_ph, p_h_ph], -1) else: p_input = tf.concat([state_ph_n, p_c_ph, p_h_ph], -1) if share_weights: # add agent id to input as layers share weights p_input = tf.concat([p_input, tf.expand_dims(tf.tile(tf.eye(n_agents)[p_index:p_index + 1], [tf.shape(p_input)[0], 1]), 1)], -1) p, p_state_out = p_func(p_input, p_res, scope="p_func", num_units=num_units) else: if not use_global_state: p_input = obs_ph_n[p_index] else: p_input = state_ph_n[p_index] if share_weights: # add agent id to input as layers share weights p_input = tf.concat([p_input, tf.expand_dims(tf.tile(tf.eye(n_agents)[p_index:p_index + 1], [tf.shape(p_input)[0], 1]), 1)], -1) p = p_func(p_input, p_res, scope="p_func", num_units=num_units) p_func_vars = U.scope_vars(U.absolute_scope_name("p_func")) # wrap parameters in distribution act_pd = act_pdtype_n[p_index].pdfromflat(p) act_test_pd = act_test_pdtype_n[p_index].pdfromflat(p, test=True) # NOTE: test=True during testing time act_sample = act_pd.sample() act_test_sample = act_test_pd.sample() p_reg = tf.reduce_mean(tf.square(act_pd.flatparam())) act_input_n = act_ph_n + [] act_input_n[p_index] = act_pd.sample() # deal with central state obs_or_state = state_ph_n if use_global_state else obs_ph_n # need to check this -- need safety checks if q_lstm_on: q_input = tf.concat(obs_or_state + act_input_n + q_c_ph_n + q_h_ph_n, -1) # unclear + obs_ph_n if share_weights: # add agent id to input as layers share weights q_input = tf.concat([q_input, tf.expand_dims(tf.tile(tf.eye(n_agents)[p_index:p_index + 1], [tf.shape(q_input)[0], 1]), 1)], -1) q, _ = q_func(q_input, 1, scope="q_func", num_units=num_units, reuse=True) else: q_input = tf.concat(obs_or_state + act_input_n, -1) if share_weights: # add agent id to input as layers share weights q_input = tf.concat([q_input, tf.expand_dims(tf.tile(tf.eye(n_agents)[p_index:p_index + 1], [tf.shape(q_input)[0], 1]), 1)], -1) q = q_func(q_input, 1, scope="q_func", num_units=num_units, reuse=True) q = q[:, 0] pg_loss = -tf.reduce_mean(q) loss = pg_loss + p_reg * 1e-3 optimize_expr = U.minimize_and_clip(optimizer, loss, p_func_vars, grad_norm_clipping) act_test = U.function(inputs=[obs_ph_n[p_index], p_c_ph, p_h_ph], outputs=[act_test_sample, p_state_out]) # Create callable functions obs_or_state_lst = state_ph_n + 
obs_ph_n if p_lstm_on and q_lstm_on: train = U.function(inputs=obs_or_state_lst + act_ph_n + q_c_ph_n + q_h_ph_n + p_c_ph_n + p_h_ph_n, outputs=loss, updates=[optimize_expr]) elif p_lstm_on: train = U.function(inputs=obs_or_state_lst + act_ph_n + p_c_ph_n + p_h_ph_n, outputs=loss, updates=[optimize_expr]) elif q_lstm_on: train = U.function(inputs=obs_or_state_lst + act_ph_n + q_c_ph_n + q_h_ph_n, outputs=loss, updates=[optimize_expr]) else: train = U.function(inputs=obs_or_state_lst + act_ph_n, outputs=loss, updates=[optimize_expr]) if p_lstm_on: act = U.function(inputs=[obs_ph_n[p_index], p_c_ph, p_h_ph], outputs=[act_sample, p_state_out]) p_values = U.function(inputs=[obs_ph_n[p_index], p_c_ph, p_h_ph], outputs=p) # target network target_p, target_p_state_out = p_func(p_input, p_res, scope="target_p_func", num_units=num_units) else: act = U.function(inputs=[obs_ph_n[p_index]], outputs=act_sample) p_values = U.function(inputs=[obs_ph_n[p_index]], outputs=p) # target network target_p = p_func(p_input, p_res, scope="target_p_func", num_units=num_units) target_p_func_vars = U.scope_vars(U.absolute_scope_name("target_p_func")) update_target_p = make_update_exp(p_func_vars, target_p_func_vars) target_pd = act_pdtype_n[p_index].pdfromflat(target_p) target_act_sample = target_pd.sample() if p_lstm_on: target_act = U.function(inputs=[obs_ph_n[p_index], p_c_ph, p_h_ph], outputs=target_act_sample) else: target_act = U.function(inputs=[obs_ph_n[p_index]], outputs=target_act_sample) return act, act_test, train, update_target_p, {'p_values': p_values, 'target_act': target_act}
def p_train(make_obs_ph_n, act_space_n, p_index, p_func, q_func, optimizer, index,
            grad_norm_clipping=None, local_q_func=False, num_units=64,
            scope="trainer", reuse=None, ensemble_num=5):
    with tf.variable_scope(scope, reuse=reuse):
        # create distributions: one action-distribution type per agent's action space
        act_pdtype_n = [make_pdtype(act_space) for act_space in act_space_n]

        # set up placeholders
        obs_ph_n = make_obs_ph_n  # inputs: observations
        # outputs: one action placeholder per agent, named with this ensemble member's index
        act_ph_n = [act_pdtype_n[i].sample_placeholder([None],
                                                       name="action" + str(index) + str(i))
                    for i in range(len(act_space_n))]

        p_input = obs_ph_n[p_index]  # the observation received by this agent

        # actor: a fully connected network mapping the observation to action parameters
        p = p_func(p_input, int(act_pdtype_n[p_index].param_shape()[0]),
                   scope="p_func" + str(index), num_units=num_units)
        # all parameters of this actor network
        p_func_vars = U.scope_vars(U.absolute_scope_name("p_func" + str(index)))

        # wrap parameters in distribution
        act_pd = act_pdtype_n[p_index].pdfromflat(p)
        act_sample = act_pd.sample()  # sample an action from the distribution
        # flatten the action parameters and penalize their squared magnitude
        p_reg = tf.reduce_mean(tf.square(act_pd.flatparam()))

        act_input_n = act_ph_n + []  # action inputs (placeholders)
        act_input_n[p_index] = act_pd.sample()
        q_input = tf.concat(obs_ph_n + act_input_n, 1)
        if local_q_func:  # if using a local Q function
            q_input = tf.concat([obs_ph_n[p_index], act_input_n[p_index]], 1)
        # centralized critic: a fully connected network, reused from q_train
        q = q_func(q_input, 1, scope="q_func", reuse=True, num_units=num_units)[:, 0]
        pg_loss = -tf.reduce_mean(q)

        loss = pg_loss + p_reg * 1e-3

        optimize_expr = U.minimize_and_clip(optimizer, loss, p_func_vars, grad_norm_clipping)

        # Create callable functions
        train = U.function(inputs=obs_ph_n + act_ph_n, outputs=loss,
                           updates=[optimize_expr])  # actor training step
        act = U.function(inputs=[obs_ph_n[p_index]], outputs=act_sample)  # action from own observation
        p_values = U.function([obs_ph_n[p_index]], p)  # raw actor outputs

        # target network: a fully connected network with the same architecture as the actor
        target_p = p_func(p_input, int(act_pdtype_n[p_index].param_shape()[0]),
                          scope="target_p_func" + str(index), num_units=num_units)
        # parameters of the target network
        target_p_func_vars = U.scope_vars(
            U.absolute_scope_name("target_p_func" + str(index)))
        # soft (Polyak) update: move the target parameters a little toward the actor each call
        update_target_p = make_update_exp(p_func_vars, target_p_func_vars)

        target_act_sample = act_pdtype_n[p_index].pdfromflat(target_p).sample()  # action from the target policy
        target_act = U.function(inputs=[obs_ph_n[p_index]], outputs=target_act_sample)

        return act, train, update_target_p, {'p_values': p_values, 'target_act': target_act}
def p_train(make_obs_ph_n, act_space_n, p_index, p_func, q_func, optimizer, grad_norm_clipping=None, local_q_func=False, num_units=64, scope="trainer", reuse=None): """ Policy learning guided by Q-value Args: make_obs_ph_n (tf.placeholder): Placeholder for the observation space of all agents act_space_n (list): A list of the action spaces for all agents p_index (int): Agent index number p_func (function): MLP Neural Network model for the agent. q_func (function): MLP Neural Network model for the agent. optimizer (function): Network Optimizer function grad_norm_clipping (float): Value by which to clip the norm of the gradient local_q_func (boolean): Flag for using local q function num_units (int): The number outputs for the layers of the model scope (str): The name of the scope reuse (boolean): Flag specifying whether to reuse the scope Returns: act (function): Action function for retrieving agent action. train (function): Training function for P network update_target_p (function): Update function for updating P network values p_debug (dict): Contains 'p_values' and 'target_act' of the P network """ with tf.variable_scope(scope, reuse=reuse): # Create distribtuions act_pdtype_n = [make_pdtype(act_space) for act_space in act_space_n] # Set up placeholders obs_ph_n = make_obs_ph_n act_ph_n = [ act_pdtype_n[i].sample_placeholder([None], name="action" + str(i)) for i in range(len(act_space_n)) ] p_input = obs_ph_n[p_index] p = p_func(p_input, int(act_pdtype_n[p_index].param_shape()[0]), scope="p_func", num_units=num_units) p_func_vars = tf_util.scope_vars(tf_util.absolute_scope_name("p_func")) # Wrap parameters in distribution act_pd = act_pdtype_n[p_index].pdfromflat(p) act_sample = act_pd.sample() p_reg = tf.reduce_mean(tf.square(act_pd.flatparam())) act_input_n = act_ph_n + [] act_input_n[p_index] = act_pd.sample() q_input = tf.concat(obs_ph_n + act_input_n, 1) if local_q_func: q_input = tf.concat([obs_ph_n[p_index], act_input_n[p_index]], 1) q = q_func(q_input, 1, scope="q_func", reuse=True, num_units=num_units)[:, 0] pg_loss = -tf.reduce_mean(q) loss = pg_loss + p_reg * 1e-3 optimize_expr = tf_util.minimize_and_clip(optimizer, loss, p_func_vars, grad_norm_clipping) # Create callable functions train = tf_util.function(inputs=obs_ph_n + act_ph_n, outputs=loss, updates=[optimize_expr]) act = tf_util.function(inputs=[obs_ph_n[p_index]], outputs=act_sample) p_values = tf_util.function([obs_ph_n[p_index]], p) # Target network target_p = p_func(p_input, int(act_pdtype_n[p_index].param_shape()[0]), scope="target_p_func", num_units=num_units) target_p_func_vars = tf_util.scope_vars( tf_util.absolute_scope_name("target_p_func")) update_target_p = make_update_exp(p_func_vars, target_p_func_vars) target_act_sample = act_pdtype_n[p_index].pdfromflat(target_p).sample() target_act = tf_util.function(inputs=[obs_ph_n[p_index]], outputs=target_act_sample) return act, train, update_target_p, { 'p_values': p_values, 'target_act': target_act }
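# Hedged usage sketch of the callables returned by p_train above, assuming they were
# unpacked as `act, train, update_target_p, p_debug`; `obs_n` and `act_n` stand for a
# batch of observations/actions drawn from a replay buffer and are not defined here.
action = act(obs_n[p_index][0][None])[0]              # single observation -> sampled action
p_loss = train(*(obs_n + act_n))                      # one actor update on the batch
update_target_p()                                     # soft-update the target policy network
next_actions = p_debug['target_act'](obs_n[p_index])  # actions from the target policy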
def q_train(make_obs_ph_n, act_space_n, q_index, q_func, u_func, optimizer, optimizer_lamda, exp_var_alpha=None, cvar_alpha=None, cvar_beta=None, grad_norm_clipping=None, local_q_func=False, scope="trainer", reuse=None, num_units=64, u_estimation=False, constrained=True, constraint_type=None, agent_type=None): with tf.variable_scope(scope, reuse=tf.AUTO_REUSE): if constrained: lamda_constraint = tf.get_variable( 'lamda_constraint' + str(q_index), [1], initializer=tf.constant_initializer(1.0), dtype=tf.float32) if not constrained or constraint_type == "CVAR": v_constraint = tf.get_variable( 'v_constraint' + str(q_index), [1], initializer=tf.constant_initializer(1.0), dtype=tf.float32) # create distribtuions act_pdtype_n = [make_pdtype(act_space) for act_space in act_space_n] # set up placeholders obs_ph_n = make_obs_ph_n act_ph_n = [ act_pdtype_n[i].sample_placeholder([None], name="action" + str(i)) for i in range(len(act_space_n)) ] target_ph = tf.placeholder(tf.float32, [None], name="target") if u_estimation: target_ph_u = tf.placeholder(tf.float32, [None], name="target_u") rew = tf.placeholder(tf.float32, [None], name="reward") q_input = tf.concat(obs_ph_n + act_ph_n, 1) if local_q_func: q_input = tf.concat([obs_ph_n[q_index], act_ph_n[q_index]], 1) q = q_func(q_input, 1, scope="q_func", num_units=num_units)[:, 0] if u_estimation: u_input = tf.concat(obs_ph_n + act_ph_n, 1) u = u_func(u_input, 1, scope="u_func", num_units=num_units)[:, 0] u_loss = tf.reduce_mean( tf.square( tf.square(rew) + 2 * tf.multiply(rew, target_ph) + target_ph_u - u)) var = u - tf.square(q) else: var = tf.square(rew + target_ph) - tf.square(q) if not constrained or constraint_type == "CVAR": cvar = v_constraint + (1.0 / (1.0 - cvar_beta)) * tf.reduce_mean( tf.nn.relu(q - v_constraint)) cvar_loss = tf.reduce_mean(cvar) if constrained: if constraint_type == "Exp_Var": #print ('In constraint generation with lamda alpha') constraint = lamda_constraint * (var - exp_var_alpha) q_loss = tf.reduce_mean( tf.square((target_ph + rew + constraint) - q)) elif constraint_type == "CVAR": constraint = lamda_constraint * (cvar_alpha - cvar) q_loss = tf.reduce_mean( tf.square((target_ph + rew + constraint) - q)) else: q_loss = tf.reduce_mean(tf.square(q - (target_ph + rew))) q_func_vars = U.scope_vars(U.absolute_scope_name("q_func")) if u_estimation: u_func_vars = U.scope_vars(U.absolute_scope_name("u_func")) # viscosity solution to Bellman differential equation in place of an initial condition q_reg = tf.reduce_mean(tf.square(q)) train3 = None if u_estimation: loss = q_loss + u_loss #+ 1e-3 * q_reg optimize_expr = U.minimize_and_clip(optimizer, loss, q_func_vars + u_func_vars, grad_norm_clipping) train = U.function(inputs=obs_ph_n + act_ph_n + [target_ph] + [target_ph_u] + [rew], outputs=[q_loss, u_loss], updates=[optimize_expr]) var_fn = U.function(inputs=obs_ph_n + act_ph_n + [target_ph] + [target_ph_u] + [rew], outputs=var) else: #print ('in loss minimization over q_func_vars') loss = q_loss optimize_expr = U.minimize_and_clip(optimizer, loss, q_func_vars, grad_norm_clipping) train = U.function(inputs=obs_ph_n + act_ph_n + [target_ph] + [rew], outputs=q_loss, updates=[optimize_expr]) var_fn = U.function(inputs=obs_ph_n + act_ph_n + [target_ph] + [rew], outputs=var) #loss = loss + 1e-4*q_reg # Create callable functions q_values = U.function(obs_ph_n + act_ph_n, q) # target network target_q = q_func(q_input, 1, scope="target_q_func", num_units=num_units)[:, 0] target_q_func_vars = U.scope_vars( 
U.absolute_scope_name("target_q_func")) update_target_q = make_update_exp(q_func_vars, target_q_func_vars) target_q_values = U.function(obs_ph_n + act_ph_n, target_q) if u_estimation: u_values = U.function(obs_ph_n + act_ph_n, u) target_u = u_func(u_input, 1, scope="target_u_func", num_units=num_units)[:, 0] target_u_func_vars = U.scope_vars( U.absolute_scope_name("target_u_func")) update_target_u = make_update_exp(u_func_vars, target_u_func_vars) target_u_values = U.function(obs_ph_n + act_ph_n, target_u) if constrained: loss2 = -loss #print ('in loss maximisation over lamda') optimize_expr2 = U.minimize_and_clip(optimizer_lamda, loss2, [lamda_constraint], grad_norm_clipping) if u_estimation: train2 = U.function(inputs=obs_ph_n + act_ph_n + [target_ph] + [target_ph_u] + [rew], outputs=loss2, updates=[optimize_expr2]) else: train2 = U.function(inputs=obs_ph_n + act_ph_n + [target_ph] + [rew], outputs=loss2, updates=[optimize_expr2]) if not constrained or constraint_type == "CVAR": loss = cvar_loss optimize_expr3 = U.minimize_and_clip(optimizer, loss, [v_constraint], grad_norm_clipping) train3 = U.function(inputs=obs_ph_n + act_ph_n + [target_ph] + [rew], outputs=loss, updates=[optimize_expr3]) cvar_fn = U.function(inputs=obs_ph_n + act_ph_n + [target_ph] + [rew], outputs=cvar) if not u_estimation: update_target_u = None target_u_values = None u_values = None if not constrained: train2 = None lamda_constraint = None if constraint_type != "CVAR" and constrained: cvar_fn = None v_constraint = None return train, train2, train3, update_target_q, update_target_u, { 'q_values': q_values, 'u_values': u_values, 'target_q_values': target_q_values, 'target_u_values': target_u_values, 'var': var_fn, 'cvar': cvar_fn, 'lamda_constraint': lamda_constraint, 'v_constraint': v_constraint, 'optimize_expr': optimize_expr }
def _p_train(make_obs_ph_n, act_space_n, p_index, p_func, q_func, optimizer, q_lstm_on, p_lstm_on, centralized_p, grad_norm_clipping=None, local_q_func=False, num_units=64, scope="trainer", reuse=None, q_debug=None): with tf.variable_scope(scope, reuse=reuse): # create distribtuions act_pdtype_n = [make_pdtype(act_space) for act_space in act_space_n] # set up placeholders obs_ph_n = make_obs_ph_n act_ph_n = [ act_pdtype_n[i].sample_placeholder([None, 1], name="action" + str(i)) for i in range(len(act_space_n)) ] q_res = 1 p_res = int(act_pdtype_n[p_index].param_shape()[0]) # for actor p_c_ph, p_h_ph = get_lstm_state_ph(name='p_', n_batches=None, num_units=num_units) p_c_ph_n, p_h_ph_n = [p_c_ph for i in range(len(obs_ph_n)) ], [p_h_ph for i in range(len(obs_ph_n))] # for critic q_c_ph, q_h_ph = get_lstm_state_ph(name='q_', n_batches=None, num_units=num_units) q_c_ph_n, q_h_ph_n = [q_c_ph for i in range(len(obs_ph_n)) ], [q_h_ph for i in range(len(obs_ph_n))] if p_lstm_on: p_input = tf.concat([obs_ph_n[p_index], p_c_ph, p_h_ph], -1) p, p_state_out = p_func(p_input, p_res, scope="p_func", num_units=num_units) else: p_input = obs_ph_n[p_index] p = p_func(p_input, p_res, scope="p_func", num_units=num_units) p_func_vars = U.scope_vars(U.absolute_scope_name("p_func")) # wrap parameters in distribution act_pd = act_pdtype_n[p_index].pdfromflat(p) act_sample = act_pd.sample() p_reg = tf.reduce_mean(tf.square(act_pd.flatparam())) act_input_n = act_ph_n + [] act_input_n[p_index] = act_pd.sample() # need to check this -- need safety checks if q_lstm_on: q_input = tf.concat(obs_ph_n + act_input_n + q_c_ph_n + q_h_ph_n, -1) q, _ = q_func(q_input, 1, scope="q_func", num_units=num_units, reuse=True) else: q_input = tf.concat(obs_ph_n + act_input_n, -1) q = q_func(q_input, 1, scope="q_func", num_units=num_units, reuse=True) q = q[:, 0] pg_loss = -tf.reduce_mean(q) loss = pg_loss + p_reg * 1e-3 optimize_expr = U.minimize_and_clip(optimizer, loss, p_func_vars, grad_norm_clipping) # Create callable functions if p_lstm_on and q_lstm_on: train = U.function(inputs=obs_ph_n + act_ph_n + q_c_ph_n + q_h_ph_n + p_c_ph_n + p_h_ph_n, outputs=loss, updates=[optimize_expr]) elif p_lstm_on: train = U.function(inputs=obs_ph_n + act_ph_n + p_c_ph_n + p_h_ph_n, outputs=loss, updates=[optimize_expr]) elif q_lstm_on: train = U.function(inputs=obs_ph_n + act_ph_n + q_c_ph_n + q_h_ph_n, outputs=loss, updates=[optimize_expr]) else: train = U.function(inputs=obs_ph_n + act_ph_n, outputs=loss, updates=[optimize_expr]) if p_lstm_on: act = U.function(inputs=[obs_ph_n[p_index], p_c_ph, p_h_ph], outputs=[act_sample, p_state_out]) p_values = U.function(inputs=[obs_ph_n[p_index], p_c_ph, p_h_ph], outputs=p) # target network target_p, target_p_state_out = p_func(p_input, p_res, scope="target_p_func", num_units=num_units) else: act = U.function(inputs=[obs_ph_n[p_index]], outputs=act_sample) p_values = U.function(inputs=[obs_ph_n[p_index]], outputs=p) # target network target_p = p_func(p_input, p_res, scope="target_p_func", num_units=num_units) target_p_func_vars = U.scope_vars( U.absolute_scope_name("target_p_func")) update_target_p = make_update_exp(p_func_vars, target_p_func_vars) target_pd = act_pdtype_n[p_index].pdfromflat(target_p) target_act_sample = target_pd.sample() if p_lstm_on: target_act = U.function(inputs=[obs_ph_n[p_index], p_c_ph, p_h_ph], outputs=target_act_sample) else: target_act = U.function(inputs=[obs_ph_n[p_index]], outputs=target_act_sample) return act, train, update_target_p, { 'p_values': p_values, 
'target_act': target_act }
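When p_lstm_on is set, the `act` callable above takes the LSTM cell/hidden placeholders alongside the observation and also returns the next recurrent state, so a rollout has to thread that state through successive calls. A rough sketch, assuming state tensors of shape [batch, num_units] and hypothetical env/obs/episode_len variables:

import numpy as np

# Hypothetical rollout loop for the recurrent actor (shapes and unpacking assumed).
num_units = 64
p_c = np.zeros((1, num_units), dtype=np.float32)    # initial LSTM cell state
p_h = np.zeros((1, num_units), dtype=np.float32)    # initial LSTM hidden state
for _ in range(episode_len):
    action, p_state_out = act(obs[None], p_c, p_h)  # act returns [act_sample, p_state_out]
    p_c, p_h = p_state_out                          # assumed to unpack into (cell, hidden)
    obs, rew, done, _ = env.step(action[0])
    if done:
        break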
def p_train(make_obs_ph_n, act_space_n, p_index, p_func, q_func, optimizer, adversarial, adv_eps, adv_eps_s, num_adversaries, grad_norm_clipping=None, local_q_func=False, num_units=64, scope="trainer", reuse=None): with tf.variable_scope(scope, reuse=reuse): # create distribtuions act_pdtype_n = [make_pdtype(act_space) for act_space in act_space_n] # set up placeholders obs_ph_n = make_obs_ph_n act_ph_n = [ act_pdtype_n[i].sample_placeholder([None], name="action" + str(i)) for i in range(len(act_space_n)) ] p_input = obs_ph_n[p_index] p = p_func(p_input, int(act_pdtype_n[p_index].param_shape()[0]), scope="p_func", num_units=num_units) p_func_vars = U.scope_vars(U.absolute_scope_name("p_func")) # wrap parameters in distribution act_pd = act_pdtype_n[p_index].pdfromflat(p) act_sample = act_pd.sample() p_reg = tf.reduce_mean(tf.square(act_pd.flatparam())) act_input_n = act_ph_n + [] act_input_n[p_index] = act_pd.sample() q_input = tf.concat(obs_ph_n + act_input_n, 1) if local_q_func: q_input = tf.concat([obs_ph_n[p_index], act_input_n[p_index]], 1) q = q_func(q_input, 1, scope="q_func", reuse=True, num_units=num_units)[:, 0] pg_loss = -tf.reduce_mean(q) # if adversarial: # num_agents = len(act_input_n) # if p_index < num_adversaries: # adv_rate = [adv_eps_s *(i < num_adversaries) + adv_eps * (i >= num_adversaries) for i in range(num_agents)] # else: # adv_rate = [adv_eps_s *(i >= num_adversaries) + adv_eps * (i < num_adversaries) for i in range(num_agents)] # print(" adv rate for p_index : ", p_index, adv_rate) # raw_perturb = tf.gradients(pg_loss, act_input_n) # perturb = [tf.stop_gradient(tf.nn.l2_normalize(elem, axis = 1)) for elem in raw_perturb] # perturb = [perturb[i] * adv_rate[i] for i in range(num_agents)] # new_act_n = [perturb[i] + act_input_n[i] if i != p_index # else act_input_n[i] for i in range(len(act_input_n))] # adv_q_input = tf.concat(obs_ph_n + new_act_n, 1) # adv_q = q_func(adv_q_input, 1, scope = "q_func", reuse=True, num_units=num_units)[:,0] # pg_loss = -tf.reduce_mean(q) loss = pg_loss + p_reg * 1e-3 optimize_expr = U.minimize_and_clip(optimizer, loss, p_func_vars, grad_norm_clipping) # Create callable functions train = U.function(inputs=obs_ph_n + act_ph_n, outputs=loss, updates=[optimize_expr]) act = U.function(inputs=[obs_ph_n[p_index]], outputs=act_sample) p_values = U.function([obs_ph_n[p_index]], p) # target network target_p = p_func(p_input, int(act_pdtype_n[p_index].param_shape()[0]), scope="target_p_func", num_units=num_units) target_p_func_vars = U.scope_vars( U.absolute_scope_name("target_p_func")) update_target_p = make_update_exp(p_func_vars, target_p_func_vars) target_act_sample = act_pdtype_n[p_index].pdfromflat(target_p).sample() target_act = U.function(inputs=[obs_ph_n[p_index]], outputs=target_act_sample) return act, train, update_target_p, { 'p_values': p_values, 'target_act': target_act }
def _q_train(make_obs_ph_n, act_space_n, q_index, q_func, optimizer, q_lstm_on, p_lstm_on, centralized_p, grad_norm_clipping=None, local_q_func=False, scope="trainer", reuse=None, num_units=64): with tf.variable_scope(scope, reuse=reuse): # create distribtuions act_pdtype_n = [make_pdtype(act_space) for act_space in act_space_n] # set up placeholders obs_ph_n = make_obs_ph_n act_ph_n = [ act_pdtype_n[i].sample_placeholder([None, 1], name="action" + str(i)) for i in range(len(act_space_n)) ] target_ph = tf.placeholder(tf.float32, [None, 1], name="target") q_res = 1 p_res = int(act_pdtype_n[q_index].param_shape()[0]) # for actor p_c_ph, p_h_ph = get_lstm_state_ph(name='p_', n_batches=None, num_units=num_units) p_c_ph_n, p_h_ph_n = [p_c_ph for i in range(len(obs_ph_n)) ], [p_h_ph for i in range(len(obs_ph_n))] # for critic q_c_ph, q_h_ph = get_lstm_state_ph(name='q_', n_batches=None, num_units=num_units) q_c_ph_n, q_h_ph_n = [q_c_ph for i in range(len(obs_ph_n)) ], [q_h_ph for i in range(len(obs_ph_n))] if q_lstm_on: q_input = tf.concat(obs_ph_n + act_ph_n + q_c_ph_n + q_h_ph_n, -1) q, q_state_out = q_func(q_input, 1, scope="q_func", num_units=num_units) else: q_input = tf.concat(obs_ph_n + act_ph_n, -1) q = q_func(q_input, 1, scope="q_func", num_units=num_units) q = q[:, 0] q_func_vars = U.scope_vars(U.absolute_scope_name("q_func")) q_loss = tf.reduce_mean(tf.square(q - target_ph)) # viscosity solution to Bellman differential equation in place of an initial condition q_reg = tf.reduce_mean(tf.square(q)) loss = q_loss #+ 1e-3 * q_reg optimize_expr = U.minimize_and_clip(optimizer, loss, q_func_vars, grad_norm_clipping) # Create callable functions if q_lstm_on: q_values = U.function(inputs=obs_ph_n + act_ph_n + q_c_ph_n + q_h_ph_n, outputs=[q, q_state_out]) train = U.function(inputs=obs_ph_n + act_ph_n + q_c_ph_n + q_h_ph_n + [target_ph], outputs=loss, updates=[optimize_expr]) target_q, target_q_state_out = q_func(q_input, 1, scope="target_q_func", num_units=num_units) else: q_values = U.function(inputs=obs_ph_n + act_ph_n, outputs=q) train = U.function(inputs=obs_ph_n + act_ph_n + [target_ph], outputs=loss, updates=[optimize_expr]) target_q = q_func(q_input, 1, scope="target_q_func", num_units=num_units) target_q = target_q[:, 0] target_q_func_vars = U.scope_vars( U.absolute_scope_name("target_q_func")) update_target_q = make_update_exp(q_func_vars, target_q_func_vars) if q_lstm_on: target_q_values = U.function(inputs=obs_ph_n + act_ph_n + q_c_ph_n + q_h_ph_n, outputs=target_q) else: target_q_values = U.function(inputs=obs_ph_n + act_ph_n, outputs=target_q) return train, update_target_q, { 'q_values': q_values, 'target_q_values': target_q_values }
def q_train(name, make_obs_ph_n, adj_n, act_space_n, num_adversaries, neighbor_n, q_func, agent_n, optimizer, grad_norm_clipping=None, local_q_func=False, reuse=None, scope="trainer", num_units=64): with tf.variable_scope(scope, reuse=reuse): # create distribtuions act_pdtype_n = [make_pdtype(act_space) for act_space in act_space_n] # number of agents in this species agent_n_species = num_adversaries if name == "adversaries" else agent_n - num_adversaries # set up placeholders obs_ph_n = make_obs_ph_n act_ph_n = [ act_pdtype_n[i].sample_placeholder([None], name="action" + str(i)) for i in range(len(act_space_n)) ] target_ph = [ tf.placeholder(tf.float32, [None], name="target") for _ in range(agent_n_species) ] q = [] q_square = [] q_input = tf.concat(obs_ph_n + act_ph_n, 1) for a in range(agent_n_species): temp = q_func(q_input, 1, scope="q_func_%d" % a, num_units=num_units)[:, 0] q.append(temp) # q1 = tf.stack([q[i] for i in range(agent_n_species)], axis=1) # q_square = [tf.square(tf.reduce_mean(q[i] - target_ph[i], axis=1)) for i in range(agent_n_species)] q_func_vars = [ U.scope_vars(U.absolute_scope_name("q_func_%d" % i)) for i in range(agent_n_species) ] q_loss = [ tf.reduce_mean(tf.square(q[i] - target_ph[i])) for i in range(agent_n_species) ] # viscosity solution to Bellman differential equation in place of an initial condition # q_reg = tf.reduce_mean(tf.square(q1)) loss = q_loss # + 1e-3 * q_reg optimize_expr = [ U.minimize_and_clip(optimizer, loss[i], q_func_vars[i], grad_norm_clipping) for i in range(agent_n_species) ] # Create callable functions train = [ U.function(inputs=obs_ph_n + act_ph_n + [target_ph[i]], outputs=loss[i], updates=[optimize_expr[i]]) for i in range(agent_n_species) ] q_values = U.function(obs_ph_n + act_ph_n, q) # target network target_q = [] for a in range(agent_n_species): temp = q_func(q_input, 1, scope="target_q_func_%d" % a, num_units=num_units)[:, 0] target_q.append(temp) target_q_func_vars = [ U.scope_vars(U.absolute_scope_name("target_q_func_%d" % i)) for i in range(agent_n_species) ] update_target_q = make_update_exp(q_func_vars, target_q_func_vars, central=False) target_q_values = U.function(obs_ph_n + act_ph_n, target_q) return train, update_target_q, q_values, target_q_values
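Unlike the single-critic variants elsewhere in this file, this species-level q_train returns lists of per-agent train functions, losses and optimizers, so the caller updates the species members in a loop. A hypothetical sketch of that loop (the batch and target names are assumed):

# Hypothetical caller-side update over the species' critics returned above.
q_losses = [
    train[a](*(obs_n_batch + act_n_batch + [td_targets[a]]))  # each train[a] expects its own target
    for a in range(agent_n_species)
]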
def q_train(make_obs_ph_n, make_meesages_ph_n, act_space_n, q_func, optimizer, grad_norm_clipping=None, local_q_func=False, scope="trainer", reuse=None, num_units=64): with tf.variable_scope(scope, reuse=reuse): num_agents = len(make_obs_ph_n) # create distribtuions act_pdtype_n = [make_pdtype(act_space) for act_space in act_space_n] # set up placeholders obs_ph_n = make_obs_ph_n messages_ph_n = make_meesages_ph_n act_ph_n = [ act_pdtype_n[i].sample_placeholder([None], name="action_{}".format(i)) for i in range(len(act_space_n)) ] target_ph_n = [ tf.placeholder(tf.float32, [None], name="target_{}".format(i)) for i in range(num_agents) ] q_input = tf.concat(obs_ph_n + messages_ph_n + act_ph_n, 1) q_n = [ q_func(q_input, 1, scope="q_func_{}".format(i), num_units=num_units)[:, 0] for i in range(num_agents) ] q_func_vars = [ U.scope_vars(U.absolute_scope_name("q_func_{}".format(i))) for i in range(num_agents) ] q_loss_n = [ tf.reduce_mean(tf.square(q - target_ph)) for q, target_ph in zip(q_n, target_ph_n) ] # viscosity solution to Bellman differential equation in place of an initial condition # q_reg = tf.reduce_mean(tf.square(q)) q_loss = tf.reduce_sum(q_loss_n) loss = q_loss # + 1e-3 * q_reg var_list = list(itertools.chain(*q_func_vars)) optimize_expr = U.minimize_and_clip(optimizer, loss, var_list, grad_norm_clipping) # Create callable functions train = U.function(inputs=obs_ph_n + messages_ph_n + act_ph_n + target_ph_n, outputs=loss, updates=[optimize_expr]) q_values = U.function(obs_ph_n + messages_ph_n + act_ph_n, q_n) # target network target_q_n = [ q_func(q_input, 1, scope="target_q_func_{}".format(i), num_units=num_units)[:, 0] for i in range(num_agents) ] target_q_func_vars = [ U.scope_vars(U.absolute_scope_name("target_q_func_{}".format(i))) for i in range(num_agents) ] traget_var_list = list(itertools.chain(*target_q_func_vars)) update_target_q = make_update_exp(var_list, traget_var_list) target_q_values = U.function(obs_ph_n + messages_ph_n + act_ph_n, target_q_n) return train, update_target_q, { 'q_values': q_values, 'target_q_values': target_q_values }
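Because the per-agent critic losses in this variant are summed into one scalar and optimised jointly, the per-agent variable lists are first flattened into a single list with itertools.chain; the flattening on its own is just:

import itertools

# Illustrative only: chain(*lists) concatenates the per-agent variable lists
# (stand-in strings here instead of tf variables).
q_func_vars = [['q0/w', 'q0/b'], ['q1/w', 'q1/b']]
var_list = list(itertools.chain(*q_func_vars))
print(var_list)  # ['q0/w', 'q0/b', 'q1/w', 'q1/b']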
def p_train_recurrent(make_obs_ph_n, make_state_ph_n, make_obs_next_n, make_obs_pred_n, act_space_n, p_index, p_policy, p_predict, q_func, optimizer, grad_norm_clipping=None, local_q_func=False, num_units=64, scope="trainer", reuse=None): with tf.variable_scope(scope, reuse=reuse): # create distribtuions # set up placeholders obs_ph_n = make_obs_ph_n # all obs, in shape Agent_num * batch_size * time_step * obs_shape obs_next_n = make_obs_next_n state_ph_n = make_state_ph_n obs_pred_n = make_obs_pred_n # used for action act_pdtype_n = [make_pdtype(act_space) for act_space in act_space_n] act_ph_n = [ act_pdtype_n[i].sample_placeholder([None], name="action" + str(i)) for i in range(len(act_space_n)) ] # p_input is local obs of an agent obs_input = obs_ph_n[p_index] state_input = state_ph_n[p_index] act_input = act_ph_n[p_index] obs_next = obs_next_n[p_index] obs_pred_input = obs_pred_n[p_index] # get output and state p, gru_out, state = p_policy( obs_input, state_input, obs_pred_input, int(act_pdtype_n[p_index].param_shape()[0]), scope="p_policy", num_units=num_units) act_pd = act_pdtype_n[p_index].pdfromflat( p) # wrap parameters in distribution act_sample = act_pd.sample() # sample an action # predict the next obs obs_pred = p_predict(act_input, gru_out, int(obs_input.shape[1]), scope="p_predict", num_units=num_units) # variables for optimization p_func_vars = U.scope_vars( U.absolute_scope_name("p_policy")) + U.scope_vars( U.absolute_scope_name("p_predict")) pred_loss = tf.reduce_mean(tf.square(obs_next - obs_pred)) # predict loss p_reg = tf.reduce_mean(tf.square(act_pd.flatparam())) # reg item # use critic net to get the loss about policy act_input_n = act_ph_n + [] act_input_n[p_index] = act_pd.sample( ) # only modify the action of this agent q_input = tf.concat( obs_ph_n + act_input_n, 1) # get the input for Q net (all obs + all action) if local_q_func: q_input = tf.concat([obs_ph_n[p_index], act_input_n[p_index]], 1) q = q_func(q_input, 1, scope="q_func", reuse=True, num_units=num_units)[:, 0] # get q values pg_loss = -tf.reduce_mean(q) # calculate loss to maximize Q values loss = pg_loss + p_reg * 1e-3 + pred_loss * 1e-3 optimize_expr = U.minimize_and_clip( optimizer, loss, p_func_vars, grad_norm_clipping) # update p Net parameters # Create callable functions # update P NET train = U.function(inputs=obs_ph_n + state_ph_n + act_ph_n + obs_next_n + obs_pred_n, outputs=loss, updates=[optimize_expr]) # return action and state step = U.function(inputs=[obs_ph_n[p_index]] + [state_ph_n[p_index]] + [obs_pred_n[p_index]], outputs=[act_sample] + [state] + [gru_out]) p_values = U.function(inputs=[obs_ph_n[p_index]] + [state_ph_n[p_index]] + [obs_pred_n[p_index]], outputs=p) # target network target_p, target_gru_out, target_state = \ p_policy(obs_input, state_input, obs_pred_input, int(act_pdtype_n[p_index].param_shape()[0]), scope="target_p_policy", num_units=num_units) target_obs_pred = p_predict(act_input, target_gru_out, int(obs_input.shape[1]), scope="target_p_predict", num_units=num_units) target_p_func_vars = U.scope_vars(U.absolute_scope_name("target_p_policy")) + \ U.scope_vars(U.absolute_scope_name("target_p_predict")) # update the parameters θ'i = τθi + (1 − τ)θ'i update_target_p = make_update_exp(p_func_vars, target_p_func_vars) target_act_sample = act_pdtype_n[p_index].pdfromflat(target_p).sample() target_step = U.function(inputs=[obs_ph_n[p_index]] + [state_ph_n[p_index]] + [obs_pred_n[p_index]], outputs=[target_act_sample] + [target_state] + [target_gru_out]) # return predicted 
        # return predicted obs
        gru_temp = tf.placeholder(tf.float32, [None] + [num_units], name='gru_out')
        pred_temp = p_predict(act_input, gru_temp, int(obs_input.shape[1]),
                              scope="p_predict", num_units=num_units)
        predict = U.function(inputs=[act_ph_n[p_index]] + [gru_temp], outputs=pred_temp)
        target_pred_temp = p_predict(act_input, gru_temp, int(obs_input.shape[1]),
                                     scope="target_p_predict", num_units=num_units)
        target_predict = U.function(inputs=[act_ph_n[p_index]] + [gru_temp],
                                    outputs=target_pred_temp)
        return step, predict, train, update_target_p, {
            'p_values': p_values,
            'target_step': target_step,
            'target_predict': target_predict
        }
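make_update_exp, used throughout this file, performs the soft target update θ'_i = τθ_i + (1 − τ)θ'_i quoted in the comment above. A minimal stand-alone sketch of that pattern, assuming a small τ such as 0.01:

import tensorflow as tf

def soft_update_sketch(source_vars, target_vars, tau=0.01):
    # Pair variables by name order and blend each target toward its source:
    # target <- tau * source + (1 - tau) * target
    source_vars = sorted(source_vars, key=lambda v: v.name)
    target_vars = sorted(target_vars, key=lambda v: v.name)
    ops = [tf.assign(t, tau * s + (1.0 - tau) * t)
           for s, t in zip(source_vars, target_vars)]
    return tf.group(*ops)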
def p_train(make_obs_ph_n, make_meesages_ph_n, act_space_n, p_func, q_func, optimizer, grad_norm_clipping=None, local_q_func=False, num_units=64, scope="trainer", reuse=None, beta=0.01): with tf.variable_scope(scope, reuse=reuse): num_agents = len(make_obs_ph_n) # create distribtuions act_pdtype_n = [make_pdtype(act_space) for act_space in act_space_n] # set up placeholders obs_ph_n = make_obs_ph_n act_ph_n = [ act_pdtype_n[i].sample_placeholder([None], name="action" + str(i)) for i in range(num_agents) ] messages_ph_n = make_meesages_ph_n # multi_head = pre_message(messages_ph_n) items = [ p_func([obs_ph_n[i], tf.concat(messages_ph_n, 1)], int(act_pdtype_n[i].param_shape()[0]), scope="p_func_{}".format(i), num_units=num_units) for i in range(num_agents) ] p_n, message_n, mu_message_n, logvar_message_n = list(zip(*items)) logvar_message_n = [ tf.clip_by_value(log, -10, 10) for log in logvar_message_n ] # constrain kl_loss not to be too large p_func_vars = [ U.scope_vars(U.absolute_scope_name("p_func_{}".format(i))) for i in range(num_agents) ] # wrap parameters in distribution act_pd_n = [ act_pdtype_n[i].pdfromflat(p_n[i]) for i in range(num_agents) ] act_sample_n = [act_pd.sample() for act_pd in act_pd_n] p_reg_n = [ tf.reduce_mean(tf.square(act_pd.flatparam())) for act_pd in act_pd_n ] act_input_n_n = [act_ph_n + [] for _ in range(num_agents)] for i in range(num_agents): act_input_n_n[i][i] = act_pd_n[i].sample() q_input_n = [ tf.concat(obs_ph_n + messages_ph_n + act_input_n, 1) for act_input_n in act_input_n_n ] q_n = [ q_func(q_input_n[i], 1, scope="q_func_{}".format(i), reuse=True, num_units=num_units)[:, 0] for i in range(num_agents) ] pg_loss_n = [-tf.reduce_mean(q) for q in q_n] kl_loss_message_n = [ 0.5 * (tf.pow(mu, 2) + tf.pow(tf.exp(log), 2)) - log - 0.5 for mu, log in zip(mu_message_n, logvar_message_n) ] kl_loss_message = tf.reduce_mean(kl_loss_message_n) pg_loss = tf.reduce_sum(pg_loss_n) p_reg = tf.reduce_sum(p_reg_n) loss = pg_loss + p_reg * 1e-3 + beta * kl_loss_message var_list = [] var_list.extend(p_func_vars) var_list = list(itertools.chain(*var_list)) optimize_expr = U.minimize_and_clip(optimizer, loss, var_list, grad_norm_clipping) # Create callable functions train = U.function(inputs=obs_ph_n + messages_ph_n + act_ph_n, outputs=loss, updates=[optimize_expr]) act = U.function(inputs=obs_ph_n + messages_ph_n, outputs=[act_sample_n, message_n]) p_values = U.function(inputs=obs_ph_n + messages_ph_n, outputs=p_n) # target network target_items = [ p_func([obs_ph_n[i], tf.concat(messages_ph_n, 1)], int(act_pdtype_n[i].param_shape()[0]), scope="target_p_func_{}".format(i), num_units=num_units) for i in range(num_agents) ] target_p_n, target_message_n, target_mu_message_n, target_logvar_message_n = list( zip(*target_items)) target_logvar_message_n = [ tf.clip_by_value(log, -10, 10) for log in target_logvar_message_n ] # constrain kl_loss not to be too large target_p_func_vars = [ U.scope_vars(U.absolute_scope_name("target_p_func_{}".format(i))) for i in range(num_agents) ] target_var_list = [] target_var_list.extend(target_p_func_vars) target_var_list = list(itertools.chain(*target_var_list)) update_target_p = make_update_exp(var_list, target_var_list) target_act_sample_n = [ act_pdtype_n[i].pdfromflat(target_p_n[i]).sample() for i in range(num_agents) ] target_act = U.function( inputs=obs_ph_n + messages_ph_n, outputs=[target_act_sample_n, target_message_n]) return act, train, update_target_p, { 'p_values': p_values, 'target_act': target_act }
def p_train(make_obs_ph_n, act_space_n, p_index, p_func, q_func, optimizer, grad_norm_clipping=None, local_q_func=False, num_units=64, scope="trainer", reuse=None): with tf.variable_scope(scope, reuse=reuse): # create distribtuions act_pdtype_n = [make_pdtype(act_space) for act_space in act_space_n] # set up placeholders obs_ph_n = make_obs_ph_n act_ph_n = [ act_pdtype_n[i].sample_placeholder([None], name="action" + str(i)) for i in range(len(act_space_n)) ] p_input = obs_ph_n[p_index] p = p_func(p_input, int(act_pdtype_n[p_index].param_shape()[0]), scope="p_func", num_units=num_units) p_func_vars = U.scope_vars(U.absolute_scope_name("p_func")) # wrap parameters in distribution act_pd = act_pdtype_n[p_index].pdfromflat(p) act_sample = act_pd.sample() p_reg = tf.reduce_mean(tf.square(act_pd.flatparam())) act_input_n = act_ph_n + [] act_input_n[p_index] = act_pd.sample() q_input = tf.concat(obs_ph_n + act_input_n, 1) if local_q_func: q_input = tf.concat([obs_ph_n[p_index], act_input_n[p_index]], 1) q = q_func(q_input, 1, scope="q_func", reuse=True, num_units=num_units)[:, 0] pg_loss = -tf.reduce_mean(q) loss = pg_loss + p_reg * 1e-3 optimize_expr = U.minimize_and_clip(optimizer, loss, p_func_vars, grad_norm_clipping) # Create callable functions train = U.function(inputs=obs_ph_n + act_ph_n, outputs=loss, updates=[optimize_expr]) act = U.function(inputs=[obs_ph_n[p_index]], outputs=act_sample) p_values = U.function([obs_ph_n[p_index]], p) # target network target_p = p_func(p_input, int(act_pdtype_n[p_index].param_shape()[0]), scope="target_p_func", num_units=num_units) target_p_func_vars = U.scope_vars( U.absolute_scope_name("target_p_func")) update_target_p = make_update_exp(p_func_vars, target_p_func_vars) target_act_sample = act_pdtype_n[p_index].pdfromflat(target_p).sample() target_act = U.function(inputs=[obs_ph_n[p_index]], outputs=target_act_sample) return act, train, update_target_p, { 'p_values': p_values, 'target_act': target_act }
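For reference, the three callables returned by this plain p_train are used roughly as follows during one update; the variables obs_t, obs_n_batch and act_n_batch are hypothetical stand-ins for an observation and replay-buffer samples:

# Hypothetical one-step usage of the returned callables.
action = act(obs_t[None])[0]                   # acting: feed only this agent's observation
p_loss = train(*(obs_n_batch + act_n_batch))   # training: feed all agents' observations and actions
update_target_p()                              # then soft-update the target policy parameters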
def q_train(make_obs_ph_n, act_space_n, q_index, q_func, u_func, optimizer, grad_norm_clipping=None, local_q_func=False, scope="trainer", reuse=None, num_units=64, u_estimation=False): with tf.variable_scope(scope, reuse=reuse): # create distribtuions act_pdtype_n = [make_pdtype(act_space) for act_space in act_space_n] # set up placeholders obs_ph_n = make_obs_ph_n act_ph_n = [ act_pdtype_n[i].sample_placeholder([None], name="action" + str(i)) for i in range(len(act_space_n)) ] target_ph = tf.placeholder(tf.float32, [None], name="target") rew = tf.placeholder(tf.float32, [None], name="reward") if u_estimation: target_ph_u = tf.placeholder(tf.float32, [None], name="target_u") q_input = tf.concat(obs_ph_n + act_ph_n, 1) if local_q_func: q_input = tf.concat([obs_ph_n[q_index], act_ph_n[q_index]], 1) q = q_func(q_input, 1, scope="q_func", num_units=num_units)[:, 0] q_func_vars = U.scope_vars(U.absolute_scope_name("q_func")) if u_estimation: u_input = tf.concat(obs_ph_n + act_ph_n, 1) u = u_func(u_input, 1, scope="u_func", num_units=num_units)[:, 0] u_loss = tf.reduce_mean( tf.square( tf.square(rew) + 2 * tf.multiply(rew, target_ph) + target_ph_u - u)) var = u - tf.square(q) else: var = tf.square(rew + target_ph) - tf.square(q) if u_estimation: u_func_vars = U.scope_vars(U.absolute_scope_name("u_func")) q_loss = tf.reduce_mean(tf.square(q - (rew + target_ph))) # viscosity solution to Bellman differential equation in place of an initial condition q_reg = tf.reduce_mean(tf.square(q)) if u_estimation: loss = q_loss + u_loss #+ 1e-3 * q_reg optimize_expr = U.minimize_and_clip(optimizer, loss, q_func_vars + u_func_vars, grad_norm_clipping) train = U.function(inputs=obs_ph_n + act_ph_n + [target_ph] + [target_ph_u] + [rew], outputs=[q_loss, u_loss], updates=[optimize_expr]) var_fn = U.function(inputs=obs_ph_n + act_ph_n + [target_ph] + [target_ph_u] + [rew], outputs=var) else: loss = q_loss #+ 1e-3 * q_reg optimize_expr = U.minimize_and_clip(optimizer, loss, q_func_vars, grad_norm_clipping) train = U.function(inputs=obs_ph_n + act_ph_n + [target_ph] + [rew], outputs=loss, updates=[optimize_expr]) q_values = U.function(obs_ph_n + act_ph_n, q) var_fn = U.function(inputs=obs_ph_n + act_ph_n + [target_ph] + [rew], outputs=var) # target network target_q = q_func(q_input, 1, scope="target_q_func", num_units=num_units)[:, 0] target_q_func_vars = U.scope_vars( U.absolute_scope_name("target_q_func")) update_target_q = make_update_exp(q_func_vars, target_q_func_vars) target_q_values = U.function(obs_ph_n + act_ph_n, target_q) if u_estimation: u_values = U.function(obs_ph_n + act_ph_n, u) target_u = u_func(u_input, 1, scope="target_u_func", num_units=num_units)[:, 0] target_u_func_vars = U.scope_vars( U.absolute_scope_name("target_u_func")) update_target_u = make_update_exp(u_func_vars, target_u_func_vars) target_u_values = U.function(obs_ph_n + act_ph_n, target_u) if u_estimation: return train, update_target_q, update_target_u, { 'q_values': q_values, 'u_values': u_values, 'var': var_fn, 'target_q_values': target_q_values, 'target_u_values': target_u_values } else: return train, update_target_q, { 'q_values': q_values, 'var': var_fn, 'target_q_values': target_q_values }
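The `var` node above relies on the identity Var[Z] = E[Z^2] − (E[Z])^2, with the u network estimating the second moment of the return and the q network estimating its mean; a quick numerical sanity check of the identity:

import numpy as np

z = np.random.randn(100_000) * 3.0 + 1.0          # samples of some return Z
u_est = np.mean(z ** 2)                           # second moment, the quantity u approximates
q_est = np.mean(z)                                # first moment, the quantity q approximates
print(np.isclose(u_est - q_est ** 2, np.var(z)))  # True: U - Q^2 recovers the variance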
def q_train(make_obs_ph_n, act_space_n, q_index, q_func, optimizer,
            grad_norm_clipping=None, local_q_func=False, scope="trainer",
            reuse=None, num_units=64):
    with tf.variable_scope(scope, reuse=reuse):
        # create distributions
        # make_obs_ph_n are the input placeholders, with the same shape as obs_n
        act_pdtype_n = [make_pdtype(act_space) for act_space in act_space_n]
        # get the action probability-distribution type from the action dimension (5);
        # act_space comes from env.act_space and is determined by the experiment environment

        # set up placeholders
        obs_ph_n = make_obs_ph_n
        act_ph_n = [
            act_pdtype_n[i].sample_placeholder([None], name="action" + str(i))
            for i in range(len(act_space_n))
        ]
        target_ph = tf.placeholder(tf.float32, [None], name="target")  # 1-D input placeholder
        # the three placeholders above use [None] because the amount of data fed in is
        # not known in advance, i.e. None is the batch size

        q_input = tf.concat(obs_ph_n + act_ph_n, 1)
        # the Q input concatenates observations and actions along axis 1, i.e. the Q
        # network sees every agent's observation and action
        if local_q_func:  # DDPG case: train only on this agent's own observation and action
            q_input = tf.concat([obs_ph_n[q_index], act_ph_n[q_index]], 1)
        q = q_func(q_input, 1, scope="q_func", num_units=num_units)[:, 0]  # take column 0 of every row
        q_func_vars = U.scope_vars(U.absolute_scope_name("q_func"))  # the Q-network variables

        q_loss = tf.reduce_mean(tf.square(q - target_ph))
        # Q-network loss: mean squared error; target_ph is filled with the TD target
        # fed in by the caller, which comes from the target network's prediction

        # viscosity solution to Bellman differential equation in place of an initial condition
        q_reg = tf.reduce_mean(tf.square(q))
        loss = q_loss  # + 1e-3 * q_reg

        optimize_expr = U.minimize_and_clip(optimizer, loss, q_func_vars,
                                            grad_norm_clipping)  # optimizer expression, with optional gradient clipping

        # Create callable functions (theano-style functions)
        train = U.function(inputs=obs_ph_n + act_ph_n + [target_ph],
                           outputs=loss,
                           updates=[optimize_expr])
        q_values = U.function(obs_ph_n + act_ph_n, q)

        # target network
        target_q = q_func(q_input, 1, scope="target_q_func", num_units=num_units)[:, 0]
        target_q_func_vars = U.scope_vars(U.absolute_scope_name("target_q_func"))
        update_target_q = make_update_exp(q_func_vars, target_q_func_vars)

        target_q_values = U.function(obs_ph_n + act_ph_n, target_q)

        # the returned values are all theano-style functions that can be called directly
        # with values for the placeholders above
        return train, update_target_q, {
            'q_values': q_values,
            'target_q_values': target_q_values
        }
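As the comments note, target_ph is filled by the caller with a TD target computed from the target networks; in the usual MADDPG update that looks roughly like the following sketch (gamma, done_mask, the batches and the per-agent target_acts/target_q_values callables are hypothetical names from the surrounding trainer code, not from this function):

# Hypothetical construction of the value fed into target_ph.
target_act_next_n = [target_act_i(obs_next_n[i]) for i, target_act_i in enumerate(target_acts)]
target_q_next = target_q_values(*(obs_next_n + target_act_next_n))
td_target = rew_batch + gamma * (1.0 - done_mask) * target_q_next
q_loss = train(*(obs_n_batch + act_n_batch + [td_target]))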
def q_train(make_obs_ph_n, act_space_n, q_index, q_func, optimizer,
            grad_norm_clipping=None, local_q_func=False, scope="trainer",
            reuse=None, num_units=64):
    with tf.variable_scope(scope, reuse=reuse):
        # create distributions
        act_pdtype_n = [make_pdtype(act_space) for act_space in act_space_n]

        # set up placeholders
        obs_ph_n = make_obs_ph_n
        act_ph_n = [
            act_pdtype_n[i].sample_placeholder([None], name="action" + str(i))
            for i in range(len(act_space_n))
        ]
        target_ph = tf.placeholder(tf.float32, [None], name="target")

        q_input = tf.concat(obs_ph_n + act_ph_n, 1)
        if local_q_func:
            q_input = tf.concat([obs_ph_n[q_index], act_ph_n[q_index]], 1)
        q = q_func(q_input, 1, scope="q_func", num_units=num_units)[:, 0]
        # q_func is a function whose output is the output of a fully connected network, i.e. q
        q_func_vars = U.scope_vars(U.absolute_scope_name("q_func"))
        # get the parameters used inside that function (the fully connected layers' weights)

        q_loss = tf.reduce_mean(tf.square(q - target_ph))
        # squared loss; this is the DQN-style loss of the critic

        # viscosity solution to Bellman differential equation in place of an initial condition
        q_reg = tf.reduce_mean(tf.square(q))
        loss = q_loss  # + 1e-3 * q_reg  (similar to weight decay, to prevent overfitting)

        optimize_expr = U.minimize_and_clip(optimizer, loss, q_func_vars, grad_norm_clipping)

        # Create callable functions
        train = U.function(inputs=obs_ph_n + act_ph_n + [target_ph],
                           outputs=loss,
                           updates=[optimize_expr])  # packs the graph from inputs to outputs into one callable
        q_values = U.function(obs_ph_n + act_ph_n, q)

        # target network: used to compute the Q target; its parameters are not trained
        # directly but are periodically copied from the Q network
        target_q = q_func(q_input, 1, scope="target_q_func", num_units=num_units)[:, 0]
        target_q_func_vars = U.scope_vars(U.absolute_scope_name("target_q_func"))
        # get the target Q network's parameters
        update_target_q = make_update_exp(q_func_vars, target_q_func_vars)

        target_q_values = U.function(obs_ph_n + act_ph_n, target_q)
        # packs the target network into a function so the Q target can be computed conveniently

        return train, update_target_q, {
            'q_values': q_values,
            'target_q_values': target_q_values
        }
def q_train(make_obs_ph_n, act_space_n, q_index, q_func, optimizer, grad_norm_clipping=None, local_q_func=False, scope="trainer", reuse=None, num_units=256): with tf.variable_scope(scope, reuse=reuse): # create distribtuions # act_pdtype_n = [make_pdtype(act_space) for act_space in act_space_n] act_pdtype_n = [ SoftCategoricalPdType(act_space_n[q_index]) for act_space in act_space_n ] # set up placeholders obs_ph_n = make_obs_ph_n act_ph_n = [ act_pdtype_n[i].sample_placeholder([None], name="action" + str(i)) for i in range(len(act_space_n)) ] target_ph = tf.placeholder(tf.float32, [None], name="target") q_input = tf.concat(obs_ph_n + act_ph_n, 1) if local_q_func: q_input = tf.concat([obs_ph_n[q_index], act_ph_n[q_index]], 1) q = q_func(q_input, 1, scope="q_func", num_units=num_units)[:, 0] q_func_vars = U.scope_vars(U.absolute_scope_name("q_func")) q_loss = tf.reduce_mean(tf.square(q - target_ph)) # viscosity solution to Bellman differential equation in place of an initial condition q_reg = tf.reduce_mean(tf.square(q)) loss = q_loss #+ 1e-3 * q_reg optimize_expr = U.minimize_and_clip(optimizer, loss, q_func_vars, grad_norm_clipping) # Create callable functions train = U.function(inputs=obs_ph_n + act_ph_n + [target_ph], outputs=loss, updates=[optimize_expr]) q_values = U.function(obs_ph_n + act_ph_n, q) # target network target_q = q_func(q_input, 1, scope="target_q_func", num_units=num_units)[:, 0] target_q_func_vars = U.scope_vars( U.absolute_scope_name("target_q_func")) update_target_q = make_update_exp(q_func_vars, target_q_func_vars) target_q_values = U.function(obs_ph_n + act_ph_n, target_q) return train, update_target_q, { 'q_values': q_values, 'target_q_values': target_q_values }
def __init__(self, n_b_agent, a_dim, s_dim, a_bound=1, gamma=0.95, tau=0.01, lr_a=1e-2, lr_c=1e-2, memory_size=100000, batch_size=64, scope=""): self.nb_agent = n_b_agent self.memory = np.zeros( (memory_size, s_dim * 2 * self.nb_agent + a_dim * self.nb_agent + 1 + 1), dtype=np.float32) self.pointer = 0 self.sess = tf.Session() self.memory_size = memory_size self.batch_size = batch_size self.memory_filled = 0 self.a_dim, self.s_dim, self.a_bound = a_dim, s_dim, a_bound, self.S = tf.placeholder(tf.float32, [None, s_dim], 's') self.total_a = tf.placeholder(tf.float32, [None, self.a_dim * self.nb_agent], 'a') self.S_ = tf.placeholder(tf.float32, [None, s_dim], 's_') self.total_a_ = tf.placeholder(tf.float32, [None, self.a_dim * self.nb_agent], 'a_') self.R = tf.placeholder(tf.float32, [None, 1], 'r') self.D = tf.placeholder(tf.float32, [None, 1], 'done') self.scope = scope with tf.variable_scope('Actor'): self.a, self.pre_a = self._build_a(self.S, scope='eval', trainable=True) self.a_, *_ = self._build_a(self.S_, scope='target', trainable=False) with tf.variable_scope('Critic'): # assign self.a = a in memory when calculating q for td_error, # otherwise the self.a is from Actor when updating Actor q = self._build_c(self.S, self.total_a, scope='eval', trainable=True) q_ = self._build_c(self.S_, self.total_a_, scope='target', trainable=False) # networks parameters prefix = (self.scope + "/") if len(self.scope) > 0 else "" self.ae_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope=prefix + 'Actor/eval') self.at_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope=prefix + 'Actor/target') self.ce_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope=prefix + 'Critic/eval') self.ct_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope=prefix + 'Critic/target') # target net replacement self.soft_replace = [ tf.assign(t, (1 - tau) * t + tau * e) for t, e in zip(self.at_params + self.ct_params, self.ae_params + self.ce_params) ] q_target = self.R + (1. - self.D) * gamma * q_ # in the feed_dic for the td_error, the self.a should change to actions in memory td_error = tf.losses.mean_squared_error(labels=q_target, predictions=q) # self.ctrain = tf.train.AdamOptimizer(lr_c).minimize(td_error, var_list=self.ce_params) optimizer = tf.train.AdamOptimizer(lr_c) self.ctrain = U.minimize_and_clip(optimizer, td_error, self.ce_params, .5) a_reg = tf.reduce_mean(tf.reduce_sum(tf.square(self.pre_a), axis=-1)) a_loss = -tf.reduce_mean(q) + 1e-3 * a_reg # maximize the q # self.atrain = tf.train.AdamOptimizer(lr_a).minimize(a_loss, var_list=self.ae_params) optimizer = tf.train.AdamOptimizer(lr_a) self.atrain = U.minimize_and_clip(optimizer, a_loss, self.ae_params, .5) self.sess.run(tf.global_variables_initializer())
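The flat replay buffer allocated above stores one transition per row with width s_dim * 2 * nb_agent + a_dim * nb_agent + 2; a hypothetical store_transition that matches that width (the exact field order inside a row is an assumption, not taken from the original class):

import numpy as np

def store_transition(self, s_all, a_all, r, done, s_next_all):
    # Assumed row layout: [all agents' s | all agents' a | r | done | all agents' s'].
    row = np.hstack([np.ravel(s_all), np.ravel(a_all),
                     [r], [float(done)], np.ravel(s_next_all)])
    index = self.pointer % self.memory_size     # overwrite the oldest row once the buffer is full
    self.memory[index, :] = row
    self.pointer += 1
    self.memory_filled = min(self.memory_filled + 1, self.memory_size)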