def __init__(self, act_space, mem_units=64, name=""):
    super().__init__(name="MDActorI" + name + "I")
    self.act_space = act_space
    self.act_pd = make_pdtype(act_space)
    self.pd = None
    self.output_layer = tf.keras.layers.Dense(act_space.n)
    self.optimizer = tf.keras.optimizers.Adam(learning_rate=1e-2)

    self.encoder_units = 64
    self.read_units = 20  # MUST BE EQUAL
    self.memory_units = mem_units
    self.write_units = self.memory_units
    self.action_units = 32
    self.attention_units = 16
    self.critic_units = 64
    self.action_number = act_space.n

    self.encode_layer = tf.keras.layers.Dense(self.encoder_units, tf.nn.relu)
    self.read_projection = tf.keras.layers.Dense(self.read_units, tf.nn.relu)
    self.read_layer = tf.keras.layers.Dense(self.read_units)
    self.write_projection = tf.keras.layers.Dense(self.write_units, tf.nn.sigmoid)
    self.write_layer = tf.keras.layers.Dense(self.write_units, tf.nn.tanh)
    self.remember_layer = tf.keras.layers.Dense(self.memory_units, tf.nn.sigmoid)
    self.forget_layer = tf.keras.layers.Dense(self.memory_units, tf.nn.sigmoid)
    self.action_layer = tf.keras.layers.Dense(self.action_units, tf.nn.relu)
def q_train(make_obs_ph_n, act_space_n, q_index, q_func, optimizer, grad_norm_clipping=None, local_q_func=False, scope="trainer", reuse=None, num_units=64):
    with tf.variable_scope(scope, reuse=reuse):
        # create action distributions
        act_pdtype_n = [make_pdtype(act_space) for act_space in act_space_n]

        # set up placeholders
        obs_ph_n = make_obs_ph_n
        act_ph_n = [act_pdtype_n[i].sample_placeholder([None], name="action" + str(i)) for i in range(len(act_space_n))]
        target_ph = tf.placeholder(tf.float32, [None], name="target")

        q_input = tf.concat(obs_ph_n + act_ph_n, 1)
        if local_q_func:
            q_input = tf.concat([obs_ph_n[q_index], act_ph_n[q_index]], 1)
        q = q_func(q_input, 1, scope="q_func", num_units=num_units)[:, 0]
        q_func_vars = U.scope_vars(U.absolute_scope_name("q_func"))

        q_loss = tf.reduce_mean(tf.square(q - target_ph))

        # viscosity solution to Bellman differential equation in place of an initial condition
        q_reg = tf.reduce_mean(tf.square(q))
        loss = q_loss  # + 1e-3 * q_reg

        optimize_expr = U.minimize_and_clip(optimizer, loss, q_func_vars, grad_norm_clipping)

        # Create callable functions
        train = U.function(inputs=obs_ph_n + act_ph_n + [target_ph], outputs=loss, updates=[optimize_expr])
        q_values = U.function(obs_ph_n + act_ph_n, q)

        # target network
        target_q = q_func(q_input, 1, scope="target_q_func", num_units=num_units)[:, 0]
        target_q_func_vars = U.scope_vars(U.absolute_scope_name("target_q_func"))
        update_target_q = make_update_exp(q_func_vars, target_q_func_vars)

        target_q_values = U.function(obs_ph_n + act_ph_n, target_q)

        return train, update_target_q, {'q_values': q_values, 'target_q_values': target_q_values}
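# The trainers in this file all call make_update_exp(...) to build the target-network
# update op, but it is not defined here. Below is a minimal sketch of the two-argument
# form, assuming the usual MADDPG-style Polyak averaging over name-sorted variable lists
# and that tf / U (the tf_util helpers) are already imported in this module. The polyak
# rate of 1 - 1e-2 is an assumption, not taken from this file; some variants below also
# pass an extra rate/tau argument to control this coefficient.
def make_update_exp(vals, target_vals):
    polyak = 1.0 - 1e-2
    expression = []
    for var, var_target in zip(sorted(vals, key=lambda v: v.name),
                               sorted(target_vals, key=lambda v: v.name)):
        # slowly track the online variable: theta_target <- polyak*theta_target + (1-polyak)*theta
        expression.append(var_target.assign(polyak * var_target + (1.0 - polyak) * var))
    expression = tf.group(*expression)
    return U.function([], [], updates=[expression])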
def i_train(make_obs_ph_n, intent_ph_n, act_space_n, make_intent_ph_n, make_act_traj_ph_n, i_func, i_index, output_size, optimizer, scope, reuse, grad_norm_clipping=None, num_units=64):
    with tf.variable_scope(scope, reuse=reuse):
        act_pdtype_n = [make_pdtype(act_space) for act_space in act_space_n]
        obs_ph_n = make_obs_ph_n
        # the intent placeholders can be used as the true actions; they have the same shape
        intent_ph_n = make_intent_ph_n
        flat_act_traj_ph_n = [tf.reshape(a, (-1, a.shape[1] * a.shape[2] * a.shape[3])) for a in make_act_traj_ph_n]
        act_traj_ph_n = make_act_traj_ph_n
        i_input = [tf.concat([obs, act_traj], axis=1) for obs, act_traj in zip(obs_ph_n, flat_act_traj_ph_n)]

        i = i_func(i_input[i_index], output_size, scope="i_func", num_units=num_units)
        i_func_vars = U.scope_vars(U.absolute_scope_name("i_func"))

        # define loss
        loss = tf.reduce_mean(tf.square(i - intent_ph_n[i_index]))
        optimize_expr = U.minimize_and_clip(optimizer, loss, i_func_vars, grad_norm_clipping)

        train = U.function(inputs=obs_ph_n + act_traj_ph_n + intent_ph_n, outputs=loss, updates=[optimize_expr])
        i_values = U.function(inputs=[obs_ph_n[i_index]] + [act_traj_ph_n[i_index]], outputs=i)

        # target network: feed the same per-agent input as the online network
        # (the target function below only takes agent i's observation and action trajectory)
        target_i = i_func(i_input[i_index], output_size, scope="target_i_func", num_units=num_units)
        target_i_func_vars = U.scope_vars(U.absolute_scope_name("target_i_func"))
        update_target_i = make_update_exp(i_func_vars, target_i_func_vars)
        target_i_values = U.function(inputs=[obs_ph_n[i_index]] + [act_traj_ph_n[i_index]], outputs=target_i)

        return i_values, train, update_target_i, {'i_values': i_values, 'target_i_values': target_i_values}
def p_train(env, make_obs_ph_n, act_space_n, p_index, vf_func, shana, q_func, optimizer, grad_norm_clipping=None, local_q_func=False, num_units=64, scope="trainer", reuse=None):
    with tf.variable_scope(scope, reuse=reuse):
        # create action distributions
        act_pdtype_n = [make_pdtype(act_space) for act_space in act_space_n]

        # set up placeholders
        obs_ph_n = make_obs_ph_n
        act_ph_n = [act_pdtype_n[i].sample_placeholder([None], name="action" + str(i)) for i in range(len(act_space_n))]

        policy = shana(
            env_spec=env,
            af=15,
            of=22,
            K=2,
            hidden_layer_sizes=(128, 128),
            qf=q_func,
            reg=0.001
        )
        act, log_pi = policy.actions_for(observations=make_obs_ph_n[p_index], with_log_pis=True)

        act_input_n = act_ph_n + []
        p_func_vars = policy.get_params_internal()

        q_input = tf.concat(obs_ph_n + act_input_n, 1)
        vf_input = tf.concat(obs_ph_n, 1)
        if local_q_func:
            q_input = tf.concat([obs_ph_n[p_index], act_input_n[p_index]], 1)
        q = q_func(q_input, 1, scope="q_func", reuse=True, num_units=num_units)[:, 0]
        vf = q_func(vf_input, 1, scope="vf_func", reuse=True, num_units=num_units)[:, 0]
        vf_func_vars = U.scope_vars(U.absolute_scope_name("vf_func"))

        pg_loss = tf.reduce_mean(log_pi * tf.stop_gradient(log_pi - q + vf))

        # the regularization collection is a list of tensors, so sum it before adding it to the scalar loss
        p_reg = tf.reduce_sum(tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES, scope=policy.name))
        loss = pg_loss + p_reg
        vf_loss = 0.5 * tf.reduce_mean((vf - tf.stop_gradient(q - log_pi)) ** 2)

        optimize_expr = U.minimize_and_clip(optimizer, loss, p_func_vars, grad_norm_clipping)
        mikoto = U.minimize_and_clip(optimizer, vf_loss, vf_func_vars, grad_norm_clipping)

        # Create callable functions
        train = U.function(inputs=obs_ph_n + act_ph_n, outputs=loss, updates=[optimize_expr])
        misaka = U.function(inputs=obs_ph_n + act_ph_n, outputs=loss, updates=[mikoto])

        # target network
        target_p = shana(
            env_spec=env,
            af=15,
            of=22,
            K=2,
            hidden_layer_sizes=(128, 128),
            qf=q_func,
            reg=0.001,
            name='target_policy'
        )
        target_p_func_vars = target_p.get_params_internal()
        target_vf = q_func(vf_input, 1, scope="target_vf_func", num_units=num_units)[:, 0]
        target_vf_func_vars = U.scope_vars(U.absolute_scope_name("target_vf_func"))
        target_act_r, tar_log = target_p.actions_for(observations=obs_ph_n[p_index], with_log_pis=True)

        update_target_p = make_update_exp(p_func_vars, target_p_func_vars)
        upvf = make_update_exp(vf_func_vars, target_vf_func_vars)
        target_act = U.function(inputs=[obs_ph_n[p_index]], outputs=target_act_r)

        return policy.get_actions, train, misaka, update_target_p, upvf, {'target_act': target_act}
def pMA_train(make_obs_ph_n, make_memory_ph_n, act_space_n, p_index, p_func, q_func, optimizer, grad_norm_clipping=None, local_q_func=False, critic_units=64, scope="trainer", reuse=None):
    with tf.variable_scope(scope, reuse=reuse):
        # create action distributions
        act_pdtype_n = [make_pdtype(act_space) for act_space in act_space_n]

        # set up placeholders
        obs_ph_n = make_obs_ph_n
        act_ph_n = [act_pdtype_n[i].sample_placeholder([None], name="action" + str(i)) for i in range(len(act_space_n))]

        # p_input = obs_ph_n[p_index]
        memory_input = make_memory_ph_n[p_index]
        num_agents = len(obs_ph_n)
        p_input = [None] * (num_agents + 1)
        for i in range(num_agents):
            p_input[i] = obs_ph_n[i]
        p_input[num_agents] = memory_input

        p, memory_state = p_func(obs_ph_n, memory_input, int(act_pdtype_n[p_index].param_shape()[0]), scope="pMA_func", reuse=reuse)
        p_func_vars = U.scope_vars(U.absolute_scope_name("pMA_func"))

        # wrap parameters in distribution
        act_pd = act_pdtype_n[p_index].pdfromflat(p)
        act_sample = act_pd.sample()
        p_reg = tf.reduce_mean(tf.square(act_pd.flatparam()))

        act_input_n = act_ph_n + []
        act_input_n[p_index] = act_pd.sample()
        q_input = tf.concat(obs_ph_n + act_input_n, 1)
        if local_q_func:
            q_input = tf.concat([obs_ph_n[p_index], act_input_n[p_index]], 1)
        q = q_func(q_input, 1, scope="qMA_func", reuse=True, num_units=critic_units)[:, 0]
        pg_loss = -tf.reduce_mean(q)

        loss = pg_loss + p_reg * 1e-3

        optimize_expr = U.minimize_and_clip(optimizer, loss, p_func_vars, grad_norm_clipping)

        # Create callable functions
        train = U.function(inputs=p_input + act_ph_n, outputs=loss, updates=[optimize_expr])
        act = U.function(inputs=p_input, outputs=act_sample)
        memory_out = U.function(inputs=p_input, outputs=memory_state)
        p_values = U.function(p_input, p)

        # target network (scope must match the variable lookup below)
        target_p, target_memory = p_func(obs_ph_n, memory_input, int(act_pdtype_n[p_index].param_shape()[0]), scope="target_pMA_func", reuse=reuse)
        target_p_func_vars = U.scope_vars(U.absolute_scope_name("target_pMA_func"))
        update_target_p = make_update_exp(p_func_vars, target_p_func_vars)

        target_act_sample = act_pdtype_n[p_index].pdfromflat(target_p).sample()
        target_act = U.function(inputs=p_input, outputs=target_act_sample)

        return act, memory_out, train, update_target_p, {'p_values': p_values, 'target_act': target_act}
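# Hypothetical usage sketch (not from this file) for the memory-augmented actor above:
# p_input is all agents' observations followed by this agent's memory, so `act` and
# `memory_out` take the same feed, and the returned memory is carried to the next step.
# The memory shape (1, mem_units) is an assumption about the memory placeholder.
import numpy as np

def step_with_memory(act, memory_out, obs_n, memory):
    # obs_n: list of per-agent observation vectors, memory: (1, mem_units) array
    feed = [o[None] for o in obs_n] + [memory]
    action = act(*feed)[0]           # sampled action for this agent
    next_memory = memory_out(*feed)  # updated recurrent memory
    return action, next_memory

# memory = np.zeros((1, 64), dtype=np.float32)  # initial memory, size = mem_units
# action, memory = step_with_memory(act, memory_out, obs_n, memory)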
def m_train(act_space_n, m_index, m_func, optimizer, mut_inf_coef=0, grad_norm_clipping=None, scope="trainer", reuse=None, num_units=64):
    # Disabled: the body below is an unfinished stub (obs_ph_n and m_input are dummy
    # values), so bail out before building any graph.
    return
    with tf.variable_scope(scope, reuse=reuse):
        # create action distributions
        act_pdtype_n = [make_pdtype(act_space) for act_space in act_space_n]

        # set up placeholders (dummy values, still to be replaced with real observation inputs)
        obs_ph_n = 1
        act_ph_n = [act_pdtype_n[i].sample_placeholder([None], name="action" + str(i)) for i in range(len(act_space_n))]
        target_ph = tf.placeholder(tf.float32, [None], name="target")

        m_input = [1]
        m = m_func(m_input, 1, scope="m_func", num_units=num_units)[:, 0]
        m_func_vars = U.scope_vars(U.absolute_scope_name("m_func"))

        m_loss = tf.reduce_mean(tf.square(m - target_ph))

        # viscosity solution to Bellman differential equation in place of an initial condition
        m_reg = tf.reduce_mean(tf.square(m))
        loss = m_loss  # + 1e-3 * m_reg

        optimize_expr = U.minimize_and_clip(optimizer, loss, m_func_vars, grad_norm_clipping)

        # Create callable functions
        train = U.function(inputs=obs_ph_n + act_ph_n + [target_ph], outputs=loss, updates=[optimize_expr])
        m_values = U.function(obs_ph_n + act_ph_n, m)

        # target network
        target_m = m_func(m_input, 1, scope="target_m_func", num_units=num_units)[:, 0]
        target_m_func_vars = U.scope_vars(U.absolute_scope_name("target_m_func"))
        update_target_m = make_update_exp(m_func_vars, target_m_func_vars)

        target_m_values = U.function(obs_ph_n + act_ph_n, target_m)

        return train, update_target_m, {'m_values': m_values, 'target_m_values': target_m_values}
def __init__(self, act_space, n_units=64, name=""):
    super().__init__(name="ActorI" + name + "I")
    self.base_model = MLP2_Model(n_units, tf.nn.relu, name)
    self.act_space = act_space
    self.act_pd = make_pdtype(act_space)
    self.pd = None
    self.output_layer = tf.keras.layers.Dense(act_space.n)
    self.optimizer = tf.keras.optimizers.Adam(learning_rate=1e-2)
def p_train(make_obs_ph_n, act_space_n, p_index, p_func, q_func, optimizer, grad_norm_clipping=None, local_q_func=False, num_units=64, scope="trainer", reuse=None):
    with tf.variable_scope(scope, reuse=reuse):
        # create action distributions
        # Discrete type for spread: SoftCategoricalPdType / SoftCategoricalPd
        act_pdtype_n = [make_pdtype(act_space) for act_space in act_space_n]

        # set up placeholders
        obs_ph_n = make_obs_ph_n
        act_ph_n = [act_pdtype_n[i].sample_placeholder([None], name="action" + str(i)) for i in range(len(act_space_n))]

        p_input = obs_ph_n[p_index]

        p = p_func(inputs=p_input, num_outputs=int(act_pdtype_n[p_index].param_shape()[0]), scope="p_func", num_units=num_units, num_layers=3)  # shape=(?, 2)
        p_func_vars = U.scope_vars(U.absolute_scope_name("p_func"))

        # wrap parameters in distribution
        act_pd = act_pdtype_n[p_index].pdfromflat(p)
        act_sample = act_pd.sample()  # shape=(?, 2)
        p_reg = tf.reduce_mean(tf.square(act_pd.flatparam()))

        act_input_n = act_ph_n + []
        act_input_n[p_index] = act_pd.sample()
        q_input = tf.concat(obs_ph_n + act_input_n, 1)
        if local_q_func:
            q_input = tf.concat([obs_ph_n[p_index], act_input_n[p_index]], 1)
        q = q_func(inputs=q_input, num_outputs=1, scope="q_func", reuse=True, num_units=num_units, num_layers=3)[:, 0]
        pg_loss = -tf.reduce_mean(q)

        loss = pg_loss + p_reg * 1e-3

        optimize_expr = U.minimize_and_clip(optimizer, loss, p_func_vars, grad_norm_clipping)

        # Create callable functions
        train = U.function(inputs=obs_ph_n + act_ph_n, outputs=loss, updates=[optimize_expr])
        act = U.function(inputs=[obs_ph_n[p_index]], outputs=act_sample)
        p_values = U.function([obs_ph_n[p_index]], p)

        # target network
        target_p = p_func(inputs=p_input, num_outputs=int(act_pdtype_n[p_index].param_shape()[0]), scope="target_p_func", num_units=num_units, num_layers=3)
        target_p_func_vars = U.scope_vars(U.absolute_scope_name("target_p_func"))
        update_target_p = make_update_exp(p_func_vars, target_p_func_vars)

        target_act_sample = act_pdtype_n[p_index].pdfromflat(target_p).sample()
        target_act = U.function(inputs=[obs_ph_n[p_index]], outputs=target_act_sample)

        return act, train, update_target_p, {'p_values': p_values, 'target_act': target_act}
def p_train(make_obs_ph_n, act_space_n, p_index, p_func, q_func, optimizer, grad_norm_clipping=None, local_q_func=False, num_units=64, scope="trainer", reuse=None): with tf.variable_scope(scope, reuse=reuse): # create distribtuions act_pdtype_n = [make_pdtype(act_space) for act_space in act_space_n] # set up placeholders obs_ph_n = make_obs_ph_n # [U.ensure_tf_input(make_obs_ph_n[i]("observation"+str(i))).get() for i in range(len(make_obs_ph_n))] act_ph_n = [act_pdtype_n[i].sample_placeholder([None], name="action" + str(i)) for i in range(len(act_space_n))] p_input = obs_ph_n[p_index] p = p_func(p_input, int(act_pdtype_n[p_index].param_shape()[0]), scope="p_func", num_units=num_units) p_func_vars = U.scope_vars(U.absolute_scope_name("p_func")) # wrap parameters in distribution act_pd = act_pdtype_n[p_index].pdfromflat(p) act_sample = act_pd.sample() p_reg = tf.reduce_mean(tf.square(act_pd.flatparam())) act_input_n = act_ph_n + [] act_input_n[p_index] = act_pd.sample() # act_pd.mode() # q_input = tf.concat(obs_ph_n + act_input_n, 1) if local_q_func: q_input = tf.concat([obs_ph_n[p_index], act_input_n[p_index]], 1) q = q_func(q_input, 1, scope="q_func", reuse=True, num_units=num_units)[:, 0] pg_loss = -tf.reduce_mean(q) loss = pg_loss + p_reg * 1e-3 optimize_expr = U.minimize_and_clip(optimizer, loss, p_func_vars, grad_norm_clipping) # Create callable functions train = U.function(inputs=obs_ph_n + act_ph_n, outputs=loss, updates=[optimize_expr]) act = U.function(inputs=[obs_ph_n[p_index]], outputs=act_sample) p_values = U.function([obs_ph_n[p_index]], p) # target network target_p = p_func(p_input, int(act_pdtype_n[p_index].param_shape()[0]), scope="target_p_func", num_units=num_units) target_p_func_vars = U.scope_vars(U.absolute_scope_name("target_p_func")) update_target_p = make_update_exp(p_func_vars, target_p_func_vars) sync_target_p = make_update_exp(p_func_vars, target_p_func_vars, rate=1.0) target_act_pd = act_pdtype_n[p_index].pdfromflat(target_p) target_act_sample = target_act_pd.sample() target_act_mode = target_act_pd.mode() target_act = U.function(inputs=[obs_ph_n[p_index]], outputs=target_act_sample) target_mode = U.function(inputs=[obs_ph_n[p_index]], outputs=target_act_mode) target_p_values = U.function([obs_ph_n[p_index]], target_p) return act, train, update_target_p, sync_target_p, {'p_values': p_values, 'target_p_values': target_p_values, 'target_mode': target_mode, 'target_act': target_act}
def q_train(n_agents, make_state_ph_n, make_obs_ph_n, act_space_n, q_index, q_func, optimizer, grad_norm_clipping=None, local_q_func=False, scope="trainer", reuse=None, num_units=64, discrete_action=False, target_update_tau=0.001, use_global_state=False, share_weights=False): with tf.variable_scope(scope, reuse=reuse): # create distribtuions act_pdtype_n = [make_pdtype(act_space) for act_space in act_space_n] # set up placeholders if not use_global_state: obs_ph_n = make_obs_ph_n else: obs_ph_n = make_state_ph_n act_ph_n = [act_pdtype_n[i].sample_placeholder([None], name="action"+str(i)) for i in range(len(act_space_n))] target_ph = tf.placeholder(tf.float32, [None], name="target") q_input = tf.concat(obs_ph_n + act_ph_n, 1) if local_q_func: q_input = tf.concat([obs_ph_n[q_index], act_ph_n[q_index]], 1) if share_weights: # add agent id to input as layers share weights q_input = tf.concat([q_input, tf.tile(tf.eye(n_agents)[q_index:q_index+1], [tf.shape(q_input)[0], 1])], -1) q = q_func(q_input, 1, scope="q_func", reuse=share_weights, num_units=num_units, constrain_out=False, discrete_action=discrete_action)[:, 0] #share_weights)[:, 0] q_func_vars = U.scope_vars(U.absolute_scope_name("q_func")) q_loss = tf.reduce_mean(tf.square(q - target_ph)) # viscosity solution to Bellman differential equation in place of an initial condition q_reg = tf.reduce_mean(tf.square(q)) loss = q_loss #+ 1e-3 * q_reg optimize_expr = U.minimize_and_clip(optimizer, loss, q_func_vars, grad_norm_clipping) # Create callable functions train = U.function(inputs=obs_ph_n + act_ph_n + [target_ph], outputs=loss, updates=[optimize_expr]) q_values = U.function(obs_ph_n + act_ph_n, q) # target network target_q = q_func(q_input, 1, scope="target_q_func", reuse=share_weights, num_units=num_units, constrain_out=False, discrete_action=discrete_action)[:, 0] target_q_func_vars = U.scope_vars(U.absolute_scope_name("target_q_func")) update_target_q = make_update_exp(q_func_vars, target_q_func_vars, target_update_tau) target_q_values = U.function(obs_ph_n + act_ph_n, target_q) return train, update_target_q, {'q_values': q_values, 'target_q_values': target_q_values}
def get_trainers(env, num_adversaries, obs_shape_n, arglist):
    from maddpg.common.distributions import make_pdtype
    trainers = []
    act_info_n = [make_pdtype(ac) for ac in env.action_space]
    for i in range(env.n):
        local_q_func = (arglist.adv_policy == "ddpg") if i < num_adversaries else (arglist.good_policy == "ddpg")
        trainer = MADDPG(obs_shape_n, act_info_n, i, arglist, local_q_func=local_q_func)
        trainers.append(trainer)
    return trainers
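# Illustrative only (not from this file): wiring get_trainers into a rollout step.
# env, and the MADDPG trainer's .action(obs) interface, are assumptions about the
# surrounding training script, not APIs confirmed by this file.
def example_rollout_step(env, trainers, obs_n):
    # each trainer picks an action from its own observation
    action_n = [agent.action(obs) for agent, obs in zip(trainers, obs_n)]
    new_obs_n, rew_n, done_n, info_n = env.step(action_n)
    return action_n, new_obs_n, rew_n, done_n

# trainers = get_trainers(env, num_adversaries, obs_shape_n, arglist)
# obs_n = env.reset()
# action_n, obs_n, rew_n, done_n = example_rollout_step(env, trainers, obs_n)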
def p_train(env, make_obs_ph_n, act_space_n, p_index, vf_func, shana, q_func, optimizer, grad_norm_clipping=None, local_q_func=False, num_units=64, scope="trainer", reuse=None):
    with tf.variable_scope(scope, reuse=reuse):
        # create action distributions
        act_pdtype_n = [make_pdtype(act_space) for act_space in act_space_n]

        # set up placeholders
        obs_ph_n = make_obs_ph_n
        act_ph_n = [act_pdtype_n[i].sample_placeholder([None], name="action" + str(i)) for i in range(len(act_space_n))]

        policy = shana(
            env_spec=env,
            af=15,
            of=22,
            K=2,
            hidden_layer_sizes=(100, 100),
            qf=q_func,
            reg=0.001
        )
        actions, log_pi = policy.actions_for(observations=make_obs_ph_n[p_index], with_log_pis=True)
        p_func_vars = policy.get_params_internal()

        # this agent's action comes from its own policy; the others come from the placeholders
        act_input_n = act_ph_n + []
        act_input_n[p_index] = actions

        q_input = tf.concat(obs_ph_n + act_input_n, 1)
        if local_q_func:
            q_input = tf.concat([obs_ph_n[p_index], act_input_n[p_index]], 1)
        q = q_func(q_input, 1, scope="q_func", reuse=True, num_units=num_units)[:, 0]
        pg_loss = -tf.reduce_mean(q)

        # the regularization collection is a list of tensors, so sum it before scaling
        p_reg = tf.reduce_sum(tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES, scope=policy.name))
        loss = pg_loss + p_reg * 1e-3

        optimize_expr = U.minimize_and_clip(optimizer, loss, p_func_vars, grad_norm_clipping)

        # Create callable functions
        train = U.function(inputs=obs_ph_n + act_ph_n, outputs=loss, updates=[optimize_expr])
        act = U.function(inputs=[obs_ph_n[p_index]], outputs=actions)

        # target network: a second policy instance, mirroring the online one
        target_p = shana(
            env_spec=env,
            af=15,
            of=22,
            K=2,
            hidden_layer_sizes=(100, 100),
            qf=q_func,
            reg=0.001,
            name='target_policy'
        )
        target_p_func_vars = target_p.get_params_internal()
        update_target_p = make_update_exp(p_func_vars, target_p_func_vars)

        target_act_sample, _ = target_p.actions_for(observations=obs_ph_n[p_index], with_log_pis=True)
        target_act = U.function(inputs=[obs_ph_n[p_index]], outputs=target_act_sample)

        return act, train, update_target_p, {'target_act': target_act}
def __init__(self, name, learning_rate, obs_shape_n, act_space_n, agent_index, args, local_q_func=False):
    self.name = name
    self.learning_rate = learning_rate
    self.n = len(obs_shape_n)
    self.agent_index = agent_index
    self.obs_size = obs_shape_n[agent_index]
    self.joint_obs_size = np.sum(obs_shape_n)
    self.act_size = act_space_n[agent_index].n
    self.act_pdtype_n = [make_pdtype(act_space) for act_space in act_space_n]
    self.joint_act_size = 0
    for i_act in act_space_n:
        self.joint_act_size += i_act.n
    self.args = args

    self.actor = Actor(self.obs_size, self.act_size)
    self.actor_target = Actor(self.obs_size, self.act_size)
    self.critic = self.build_critic()
    self.critic_target = self.build_critic()
    update_target(self.actor, self.actor_target, 0)
    update_target(self.critic, self.critic_target, 0)
    # self.actor, self.critic = self.build_model()
    # self.actor_target, self.critic_target = self.build_model()
    self.actor_optimizer = self.build_actor_optimizer()

    # Create experience buffer
    self.replay_buffer = ReplayBuffer(1e6)
    self.max_replay_buffer_len = args.batch_size * args.max_episode_len
    self.replay_sample_index = None

    gpu = -1
    self.device = "/gpu:{}".format(gpu) if gpu >= 0 else "/cpu:0"
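# The constructor above calls update_target(actor, actor_target, 0) to copy weights at
# initialization. A minimal sketch of such a helper, assuming both networks are Keras
# models and that tau weights the *old target* (so tau=0 is a hard copy of the online
# weights); this is an assumption about a helper not defined in this file.
def update_target(model, target_model, tau):
    weights = model.get_weights()
    target_weights = target_model.get_weights()
    # soft update: new_target = tau * old_target + (1 - tau) * online
    mixed = [tau * tw + (1.0 - tau) * w for w, tw in zip(weights, target_weights)]
    target_model.set_weights(mixed)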
def p_train(make_obs_ph_n, act_space_n, p_index, p_func, num_units=64, scope="trainer", reuse=None):
    with tf.variable_scope(scope, reuse=reuse):
        # create action distributions
        act_pdtype_n = [make_pdtype(act_space) for act_space in act_space_n]

        # set up placeholders
        obs_ph_n = make_obs_ph_n
        act_ph_n = [act_pdtype_n[i].sample_placeholder([None], name="action" + str(i)) for i in range(len(act_space_n))]

        p_input = obs_ph_n[p_index]
        p = p_func(p_input, int(act_pdtype_n[p_index].param_shape()[0]), scope="p_func", num_units=num_units)

        # wrap parameters in distribution
        act_pd = act_pdtype_n[p_index].pdfromflat(p)
        act_sample = act_pd.sample()

        # Create callable functions
        act = U.function(inputs=[obs_ph_n[p_index]], outputs=act_sample)

        return act
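# Usage sketch (illustrative, not from this file): this evaluation-only builder returns
# a single callable mapping this agent's observation batch to sampled actions, with no
# training or target-network machinery. obs_dim is a stand-in for the agent's
# observation size.
import numpy as np

def sample_action(act, obs, obs_dim):
    batch = np.asarray(obs, dtype=np.float32).reshape(1, obs_dim)  # batch of one
    return act(batch)[0]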
def p_train(make_obs_ph_n, act_space_n, p_index, p_func, q_func, optimizer, grad_norm_clipping=None, local_q_func=False, num_units=64, scope="trainer", reuse=None): with tf.variable_scope(scope, reuse=reuse): # create distribtuions act_pdtype_n = [make_pdtype(act_space) for act_space in act_space_n] # set up placeholders obs_ph_n = make_obs_ph_n act_ph_n = [ act_pdtype_n[i].sample_placeholder([None], name="action" + str(i)) for i in range(len(act_space_n)) ] p_input = obs_ph_n[p_index] p = p_func(p_input, int(act_pdtype_n[p_index].param_shape()[0]), scope="p_func", num_units=num_units) p_func_vars = U.scope_vars(U.absolute_scope_name("p_func")) # wrap parameters in distribution act_pd = act_pdtype_n[p_index].pdfromflat(p) act_sample = act_pd.sample() p_reg = tf.reduce_mean(tf.square(act_pd.flatparam())) act_input_n = act_ph_n + [] act_input_n[p_index] = act_pd.sample() q_input = tf.concat(obs_ph_n + act_input_n, 1) if local_q_func: q_input = tf.concat([obs_ph_n[p_index], act_input_n[p_index]], 1) q = q_func(q_input, 1, scope="q_func", reuse=True, num_units=num_units)[:, 0] pg_loss = -tf.reduce_mean(q) # Gradient computation mods # --------------------------------------------------------------------------------------------- obs_flat_shape = [len(obs_ph_n) * int(obs_ph_n[0].shape[-1])] act_flat_shape = [len(act_space_n) * int(act_space_n[0].shape[-1])] obs_flat_ph = tf.placeholder(tf.float32, shape=[None] + obs_flat_shape, name="obs_flat_input") act_flat_ph = tf.placeholder(tf.float32, shape=[None] + act_flat_shape, name="act_flat_input") q_vec_input = tf.concat([obs_flat_ph, act_flat_ph], axis=-1) serial_q = q_func(q_vec_input, 1, scope="q_func", reuse=True, num_units=num_units)[:, 0] # calculate gradient of serial q value wrt actions raw_grad = tf.gradients(serial_q, act_flat_ph) grad_norm = tf.divide(raw_grad, tf.norm(raw_grad)) grad_norm_value = U.function([obs_flat_ph, act_flat_ph], grad_norm) # --------------------------------------------------------------------------------------------- loss = pg_loss + p_reg * 1e-3 optimize_expr = U.minimize_and_clip(optimizer, loss, p_func_vars, grad_norm_clipping) # Create callable functions train = U.function(inputs=obs_ph_n + act_ph_n, outputs=loss, updates=[optimize_expr]) act = U.function(inputs=[obs_ph_n[p_index]], outputs=act_sample) p_values = U.function([obs_ph_n[p_index]], p) # target network target_p = p_func(p_input, int(act_pdtype_n[p_index].param_shape()[0]), scope="target_p_func", num_units=num_units) target_p_func_vars = U.scope_vars( U.absolute_scope_name("target_p_func")) update_target_p = make_update_exp(p_func_vars, target_p_func_vars) target_act_sample = act_pdtype_n[p_index].pdfromflat(target_p).sample() target_act = U.function(inputs=[obs_ph_n[p_index]], outputs=target_act_sample) return act, train, update_target_p, { 'p_values': p_values, 'target_act': target_act, 'grad_norm_value': grad_norm_value }
def q_train(make_obs_ph_n, act_space_n, q_index, q_func, optimizer, grad_norm_clipping=None, local_q_func=False, scope="trainer", reuse=None, num_units=64): with tf.variable_scope(scope, reuse=reuse): # create distribtuions act_pdtype_n = [make_pdtype(act_space) for act_space in act_space_n] # set up placeholders obs_ph_n = make_obs_ph_n act_ph_n = [ act_pdtype_n[i].sample_placeholder([None], name="action" + str(i)) for i in range(len(act_space_n)) ] target_ph = tf.placeholder(tf.float32, [None], name="target") # get flattened obs and act shape act_shape = tf.shape(act_ph_n) act_serial = tf.concat(act_ph_n, 1) act_serial = tf.reshape(act_serial, [act_shape[1], act_shape[0] * act_shape[-1]]) act_serial_values = U.function(act_ph_n, act_serial) obs_shape = tf.shape(obs_ph_n) obs_serial = tf.concat(obs_ph_n, 1) obs_serial = tf.reshape(obs_serial, [obs_shape[1], obs_shape[0] * obs_shape[-1]]) obs_serial_values = U.function(obs_ph_n, obs_serial) obs_flat_shape = [len(obs_ph_n) * int(obs_ph_n[0].shape[-1])] act_flat_shape = [len(act_space_n) * int(act_space_n[0].shape[-1])] obs_flat_ph = tf.placeholder(tf.float32, shape=[None] + obs_flat_shape, name="obs_flat_input") act_flat_ph = tf.placeholder(tf.float32, shape=[None] + act_flat_shape, name="act_flat_input") target_input = tf.concat([obs_flat_ph, act_flat_ph], axis=-1) q_input = tf.concat(obs_ph_n + act_ph_n, 1) if local_q_func: q_input = tf.concat([obs_ph_n[q_index], act_ph_n[q_index]], 1) q = q_func(q_input, 1, scope="q_func", num_units=num_units)[:, 0] q_func_vars = U.scope_vars(U.absolute_scope_name("q_func")) q_loss = tf.reduce_mean(tf.square(q - target_ph)) # viscosity solution to Bellman differential equation in place of an initial condition q_reg = tf.reduce_mean(tf.square(q)) loss = q_loss #+ 1e-3 * q_reg optimize_expr = U.minimize_and_clip(optimizer, loss, q_func_vars, grad_norm_clipping) # Create callable functions train = U.function(inputs=obs_ph_n + act_ph_n + [target_ph], outputs=loss, updates=[optimize_expr]) q_values = U.function(obs_ph_n + act_ph_n, q) # target network # target_orig_q = q_func(q_input, 1, scope="target_orig_q_func", num_units=num_units)[:,0] target_q = q_func(target_input, 1, scope="target_q_func", num_units=num_units)[:, 0] target_q_func_vars = U.scope_vars( U.absolute_scope_name("target_q_func")) update_target_q = make_update_exp(q_func_vars, target_q_func_vars) # target_q_values = U.function(obs_ph_n + act_ph_n, target_q) target_q_values = U.function([obs_flat_ph, act_flat_ph], target_q) # calculate gradient of target q value wrt actions raw_grad = tf.gradients(target_q, act_flat_ph) grad_norm = tf.divide(raw_grad, tf.norm(raw_grad)) grad_norm_value = U.function([obs_flat_ph, act_flat_ph], grad_norm) return train, update_target_q, { 'q_values': q_values, 'target_q_values': target_q_values, 'act_serial_values': act_serial_values, 'obs_serial_values': obs_serial_values, 'grad_norm_value': grad_norm_value }
def dqn_train(make_obs_ph_n, act_space_n, p_index, p_func, q_func, optimizer, grad_norm_clipping=None, local_q_func="dqn", num_units=64, scope="trainer", reuse=None):
    with tf.variable_scope(scope, reuse=reuse):
        # create action distributions
        act_pdtype_n = [make_pdtype(act_space) for act_space in act_space_n]

        # set up placeholders
        obs_ph_n = make_obs_ph_n
        act_ph_n = [act_pdtype_n[i].sample_placeholder([None], name="action" + str(i)) for i in range(len(act_space_n))]

        p_input = obs_ph_n[p_index]
        p = p_func(p_input, int(act_pdtype_n[p_index].param_shape()[0]), scope="p_func", num_units=num_units)
        p_func_vars = U.scope_vars(U.absolute_scope_name("p_func"))

        # wrap parameters in distribution
        act_pd = act_pdtype_n[p_index].pdfromflat(p)
        act_sample = act_pd.sample()

        target_ph = tf.placeholder(tf.float32, [None], name="target")
        tf_p = tf.reduce_sum(p, reduction_indices=1)
        loss = tf.reduce_mean(tf.square(tf_p - target_ph))

        optimize_expr = U.minimize_and_clip(optimizer, loss, p_func_vars, grad_norm_clipping)

        # Create callable functions
        train = U.function(inputs=obs_ph_n + act_ph_n + [target_ph], outputs=loss, updates=[optimize_expr])
        act = U.function(inputs=[obs_ph_n[p_index]], outputs=act_sample)
        p_values = U.function([obs_ph_n[p_index]], p)

        # target network
        target_p = p_func(p_input, int(act_pdtype_n[p_index].param_shape()[0]), scope="target_p_func", num_units=num_units)
        target_p_func_vars = U.scope_vars(U.absolute_scope_name("target_p_func"))
        update_target_p = make_update_exp(p_func_vars, target_p_func_vars)

        target_act_sample = act_pdtype_n[p_index].pdfromflat(target_p).sample()
        target_act = U.function(inputs=[obs_ph_n[p_index]], outputs=target_act_sample)

        return act, train, update_target_p, {'p_values': p_values, 'target_act': target_act}
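# Illustrative epsilon-greedy wrapper around the callables returned above (p_values
# gives this agent's per-action estimates). The epsilon value and the one-hot action
# encoding are assumptions, not taken from this file.
import numpy as np

def epsilon_greedy_action(p_values, obs, n_actions, epsilon=0.1):
    if np.random.rand() < epsilon:
        idx = np.random.randint(n_actions)   # explore
    else:
        q_vals = p_values(obs[None])[0]      # greedy w.r.t. the estimated values
        idx = int(np.argmax(q_vals))
    one_hot = np.zeros(n_actions, dtype=np.float32)
    one_hot[idx] = 1.0
    return one_hot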
def q_train(make_obs_ph_n, act_space_n, q_index, q_func, u_func, optimizer, optimizer_lamda, exp_var_alpha=None, cvar_alpha=None, cvar_beta=None, grad_norm_clipping=None, local_q_func=False, scope="trainer", reuse=None, num_units=64, u_estimation=False, constrained=True, constraint_type=None, agent_type=None): with tf.variable_scope(scope, reuse=tf.AUTO_REUSE): if constrained: lamda_constraint = tf.get_variable( 'lamda_constraint' + str(q_index), [1], initializer=tf.constant_initializer(1.0), dtype=tf.float32) if constraint_type == "CVAR": v_constraint = tf.get_variable( 'v_constraint' + str(q_index), [1], initializer=tf.constant_initializer(1.0), dtype=tf.float32) # create distribtuions act_pdtype_n = make_pdtype(act_space_n[q_index]) # set up placeholders obs_ph_n = make_obs_ph_n act_ph_n = [act_pdtype_n.sample_placeholder([None], name="action0")] target_ph = tf.placeholder(tf.float32, [None], name="target") if u_estimation: target_ph_u = tf.placeholder(tf.float32, [None], name="target_u") rew = tf.placeholder(tf.float32, [None], name="reward") q_input = tf.concat(obs_ph_n + act_ph_n, 1) if local_q_func: q_input = tf.concat([obs_ph_n[0], act_ph_n[0]], 1) q = q_func(q_input, 1, scope="q_func", num_units=num_units)[:, 0] if u_estimation: u_input = tf.concat(obs_ph_n + act_ph_n, 1) u = u_func(u_input, 1, scope="u_func", num_units=num_units)[:, 0] u_loss = tf.reduce_mean( tf.square( tf.square(rew) + 2 * tf.multiply(rew, target_ph) + target_ph_u - u)) var = u - tf.square(q) else: var = tf.square(rew + target_ph) - tf.square(q) if constrained: if constraint_type == "Exp_Var": #print ('In constraint generation with lamda alpha') constraint = lamda_constraint * (var - exp_var_alpha) q_loss = tf.reduce_mean( tf.square(q - (target_ph + rew - constraint))) elif constraint_type == "CVAR": cvar = v_constraint + (1.0 / (1.0 - cvar_beta)) * tf.reduce_mean( tf.nn.relu(q - v_constraint)) constraint = lamda_constraint * (cvar_alpha - cvar) q_loss = tf.reduce_mean( tf.square(q - (target_ph + rew - constraint))) else: q_loss = tf.reduce_mean(tf.square(q - (target_ph + rew))) q_func_vars = U.scope_vars(U.absolute_scope_name("q_func")) if u_estimation: u_func_vars = U.scope_vars(U.absolute_scope_name("u_func")) # viscosity solution to Bellman differential equation in place of an initial condition q_reg = tf.reduce_mean(tf.square(q)) train3 = None if u_estimation: loss = q_loss + u_loss #+ 1e-3 * q_reg optimize_expr = U.minimize_and_clip(optimizer, loss, q_func_vars + u_func_vars, grad_norm_clipping) train = U.function(inputs=obs_ph_n + act_ph_n + [target_ph] + [target_ph_u] + [rew], outputs=[q_loss, u_loss], updates=[optimize_expr]) var_fn = U.function(inputs=obs_ph_n + act_ph_n + [target_ph] + [target_ph_u] + [rew], outputs=var) elif constraint_type == "CVAR": loss = q_loss optimize_expr = U.minimize_and_clip(optimizer, loss, q_func_vars + [v_constraint], grad_norm_clipping) train = U.function(inputs=obs_ph_n + act_ph_n + [target_ph] + [rew], outputs=q_loss, updates=[optimize_expr]) cvar_fn = U.function(inputs=obs_ph_n + act_ph_n + [target_ph] + [rew], outputs=cvar) var_fn = U.function(inputs=obs_ph_n + act_ph_n + [target_ph] + [rew], outputs=var) else: #print ('in loss minimization over q_func_vars') loss = q_loss optimize_expr = U.minimize_and_clip(optimizer, loss, q_func_vars, grad_norm_clipping) train = U.function(inputs=obs_ph_n + act_ph_n + [target_ph] + [rew], outputs=q_loss, updates=[optimize_expr]) var_fn = U.function(inputs=obs_ph_n + act_ph_n + [target_ph] + [rew], outputs=var) if not constrained: 
optimize_expr3 = U.minimize_and_clip(optimizer, loss, [v_constraint], grad_norm_clipping) train3 = U.function(inputs=obs_ph_n + act_ph_n + [target_ph] + [rew], outputs=q_loss, updates=[optimize_expr3]) #loss = loss + 1e-4*q_reg # Create callable functions q_values = U.function(obs_ph_n + act_ph_n, q) # target network target_q = q_func(q_input, 1, scope="target_q_func", num_units=num_units)[:, 0] target_q_func_vars = U.scope_vars( U.absolute_scope_name("target_q_func")) update_target_q = make_update_exp(q_func_vars, target_q_func_vars) target_q_values = U.function(obs_ph_n + act_ph_n, target_q) if u_estimation: u_values = U.function(obs_ph_n + act_ph_n, u) target_u = u_func(u_input, 1, scope="target_u_func", num_units=num_units)[:, 0] target_u_func_vars = U.scope_vars( U.absolute_scope_name("target_u_func")) update_target_u = make_update_exp(u_func_vars, target_u_func_vars) target_u_values = U.function(obs_ph_n + act_ph_n, target_u) if constrained: loss2 = -loss #print ('in loss maximisation over lamda') optimize_expr2 = U.minimize_and_clip(optimizer_lamda, loss2, [lamda_constraint], grad_norm_clipping) if u_estimation: train2 = U.function(inputs=obs_ph_n + act_ph_n + [target_ph] + [target_ph_u] + [rew], outputs=loss2, updates=[optimize_expr2]) else: train2 = U.function(inputs=obs_ph_n + act_ph_n + [target_ph] + [rew], outputs=loss2, updates=[optimize_expr2]) if not u_estimation: update_target_u = None target_u_values = None u_values = None if not constrained: train2 = None lamda_constraint = None if constraint_type != "CVAR": cvar_fn = None v_constraint = None return train, train2, train3, update_target_q, update_target_u, { 'q_values': q_values, 'u_values': u_values, 'target_q_values': target_q_values, 'target_u_values': target_u_values, 'var': var_fn, 'cvar': cvar_fn, 'lamda_constraint': lamda_constraint, 'v_constraint': v_constraint, 'optimize_expr': optimize_expr }
def p_train_adv(make_obs_ph_n, act_space_n, p_index, p_func, q_func, optimizer, grad_norm_clipping=None, local_q_func=False, num_units=64, scope="trainer", reuse=None): with tf.variable_scope(scope, reuse=reuse): # create distribtuions act_pdtype_n = [make_pdtype(act_space) for act_space in act_space_n] # set up placeholders obs_ph_n = make_obs_ph_n act_ph_n = [ act_pdtype_n[i].sample_placeholder([None], name="action" + str(i)) for i in range(len(act_space_n)) ] p_input = obs_ph_n[p_index] p = p_func(p_input, int(act_pdtype_n[p_index].param_shape()[0]), scope="p_func", num_units=num_units) p_func_vars = U.scope_vars(U.absolute_scope_name("p_func")) # wrap parameters in distribution act_pd = act_pdtype_n[p_index].pdfromflat(p) act_sample = act_pd.sample() p_reg = tf.reduce_mean(tf.square(act_pd.flatparam())) act_input_n = act_ph_n + [] # changed sample = act_pd.sample() act_input_n[p_index] = sample q_input = tf.concat(obs_ph_n + act_input_n, 1) if local_q_func: q_input = tf.concat([obs_ph_n[p_index], act_input_n[p_index]], 1) q = q_func(q_input, 1, scope="q_func", reuse=True, num_units=num_units)[:, 0] ## Modifications here ## Create values vector: auto solve rows by 1 column v = tf.tile([0.0], [tf.shape(sample)[0]]) # variable for value function for i in range(act_space_n[p_index].n): # create row tensor with ith element as 1, actions are one-hot a = np.zeros((1, act_space_n[p_index].n), dtype=np.float32) a[0, i] = 1 a = tf.convert_to_tensor(a) # tile this row tensor automatic number of times a = tf.tile(a, [tf.shape(sample)[0], 1]) act_input = act_ph_n + [] act_input[p_index] = tf.convert_to_tensor(a) q_input_tmp = tf.concat(obs_ph_n + act_input, 1) if local_q_func: q_input_tmp = tf.concat( [obs_ph_n[p_index], act_input_n[p_index]], 1) # add Q(a[i], s) * pi(a[i]) to value p_i = act_pd.logits[:, i] # tmp is q values for action i multiplied by probability of taking action i tmp = tf.multiply( q_func(q_input_tmp, 1, scope="q_func", reuse=True, num_units=num_units)[:, 0], p_i) v = tf.add(v, tmp) a = tf.subtract(v, q) # loss is equal to advantage pg_loss = -tf.reduce_mean(a) ## Modifications end loss = pg_loss + p_reg * 1e-3 optimize_expr = U.minimize_and_clip(optimizer, loss, p_func_vars, grad_norm_clipping) # Create callable functions train = U.function(inputs=obs_ph_n + act_ph_n, outputs=loss, updates=[optimize_expr]) act = U.function(inputs=[obs_ph_n[p_index]], outputs=act_sample) p_values = U.function([obs_ph_n[p_index]], p) # target network target_p = p_func(p_input, int(act_pdtype_n[p_index].param_shape()[0]), scope="target_p_func", num_units=num_units) target_p_func_vars = U.scope_vars( U.absolute_scope_name("target_p_func")) update_target_p = make_update_exp(p_func_vars, target_p_func_vars) target_act_sample = act_pdtype_n[p_index].pdfromflat(target_p).sample() target_act = U.function(inputs=[obs_ph_n[p_index]], outputs=target_act_sample) return act, train, update_target_p, { 'p_values': p_values, 'target_act': target_act }
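# What the loop inside p_train_adv builds, written out in NumPy for a single state
# (illustrative; q_of_action stands in for the q_func evaluations with one-hot actions,
# and probs[i] stands in for the per-action weight p_i = act_pd.logits[:, i] used in
# the graph; neither is a function from this file).
import numpy as np

def value_baseline(q_of_action, probs, n_actions):
    # V(s) = sum_a p(a|s) * Q(s, a) with one-hot action inputs
    v = 0.0
    for i in range(n_actions):
        a = np.zeros(n_actions, dtype=np.float32)
        a[i] = 1.0
        v += probs[i] * q_of_action(a)
    return v

# The graph above then forms a = v - q for the sampled action and optimizes
# pg_loss = -mean(a), i.e. the advantage-style objective described in its comments.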
def q_train(make_obs_ph_n, act_space_n, q_index, q_func, optimizer, adversarial, adv_eps, adv_eps_s, num_adversaries, grad_norm_clipping=None, local_q_func=False, scope="trainer", reuse=None, num_units=64): with tf.variable_scope(scope, reuse=reuse): # create distribtuions act_pdtype_n = [make_pdtype(act_space) for act_space in act_space_n] # set up placeholders obs_ph_n = make_obs_ph_n act_ph_n = [ act_pdtype_n[i].sample_placeholder([None], name="action" + str(i)) for i in range(len(act_space_n)) ] target_ph = tf.placeholder(tf.float32, [None], name="target") q_input = tf.concat(obs_ph_n + act_ph_n, 1) if local_q_func: q_input = tf.concat([obs_ph_n[q_index], act_ph_n[q_index]], 1) q = q_func(q_input, 1, scope="q_func", num_units=num_units)[:, 0] q_func_vars = U.scope_vars(U.absolute_scope_name("q_func")) q_loss = tf.reduce_mean(tf.square(q - target_ph)) # viscosity solution to Bellman differential equation in place of an initial condition q_reg = tf.reduce_mean(tf.square(q)) loss = q_loss #+ 1e-3 * q_reg optimize_expr = U.minimize_and_clip(optimizer, loss, q_func_vars, grad_norm_clipping) # Create callable functions train = U.function(inputs=obs_ph_n + act_ph_n + [target_ph], outputs=loss, updates=[optimize_expr]) q_values = U.function(obs_ph_n + act_ph_n, q) # target network target_q = q_func(q_input, 1, scope="target_q_func", num_units=num_units)[:, 0] if adversarial: num_agents = len(act_ph_n) if q_index < num_adversaries: adv_rate = [ adv_eps_s * (i < num_adversaries) + adv_eps * (i >= num_adversaries) for i in range(num_agents) ] else: adv_rate = [ adv_eps_s * (i >= num_adversaries) + adv_eps * (i < num_adversaries) for i in range(num_agents) ] print(" adv rate for q_index : ", q_index, adv_rate) pg_loss = -tf.reduce_mean(target_q) raw_perturb = tf.gradients(pg_loss, act_ph_n) perturb = [ adv_eps * tf.stop_gradient(tf.nn.l2_normalize(elem, axis=1)) for elem in raw_perturb ] new_act_n = [ perturb[i] + act_ph_n[i] if i != q_index else act_ph_n[i] for i in range(len(act_ph_n)) ] adv_q_input = tf.concat(obs_ph_n + new_act_n, 1) target_q = q_func(adv_q_input, 1, scope='target_q_func', reuse=True, num_units=num_units)[:, 0] target_q_func_vars = U.scope_vars( U.absolute_scope_name("target_q_func")) update_target_q = make_update_exp(q_func_vars, target_q_func_vars) target_q_values = U.function(obs_ph_n + act_ph_n, target_q) return train, update_target_q, { 'q_values': q_values, 'target_q_values': target_q_values }
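# The adversarial branch above perturbs the other agents' actions along the gradient
# that lowers the target Q value (tf.gradients of -mean(target_q) w.r.t. each action,
# L2-normalized and scaled by adv_eps). A NumPy sketch for one agent's action, with
# grad standing in for the evaluated gradient (illustrative, not code from this file):
import numpy as np

def perturb_action(action, grad, adv_eps):
    norm = np.linalg.norm(grad)
    if norm > 0:
        action = action + adv_eps * grad / norm   # worst-case shift of the action
    return action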
def p_train(make_obs_ph_n, act_space_n, p_index, p_func, q_func, optimizer, adversarial, adv_eps, adv_eps_s, num_adversaries, grad_norm_clipping=None, local_q_func=False, num_units=64, scope="trainer", reuse=None):
    with tf.variable_scope(scope, reuse=reuse):
        # create action distributions
        act_pdtype_n = [make_pdtype(act_space) for act_space in act_space_n]

        # set up placeholders
        obs_ph_n = make_obs_ph_n
        act_ph_n = [act_pdtype_n[i].sample_placeholder([None], name="action" + str(i)) for i in range(len(act_space_n))]

        p_input = obs_ph_n[p_index]
        p = p_func(p_input, int(act_pdtype_n[p_index].param_shape()[0]), scope="p_func", num_units=num_units)
        p_func_vars = U.scope_vars(U.absolute_scope_name("p_func"))

        # wrap parameters in distribution
        act_pd = act_pdtype_n[p_index].pdfromflat(p)
        act_sample = act_pd.sample()
        p_reg = tf.reduce_mean(tf.square(act_pd.flatparam()))

        act_input_n = act_ph_n + []
        act_input_n[p_index] = act_pd.sample()
        q_input = tf.concat(obs_ph_n + act_input_n, 1)
        if local_q_func:
            q_input = tf.concat([obs_ph_n[p_index], act_input_n[p_index]], 1)
        q = q_func(q_input, 1, scope="q_func", reuse=True, num_units=num_units)[:, 0]
        pg_loss = -tf.reduce_mean(q)

        if adversarial:
            num_agents = len(act_input_n)
            if p_index < num_adversaries:
                adv_rate = [adv_eps_s * (i < num_adversaries) + adv_eps * (i >= num_adversaries) for i in range(num_agents)]
            else:
                adv_rate = [adv_eps_s * (i >= num_adversaries) + adv_eps * (i < num_adversaries) for i in range(num_agents)]
            print(" adv rate for p_index : ", p_index, adv_rate)

            # perturb the other agents' actions along the gradient that hurts this agent most
            raw_perturb = tf.gradients(pg_loss, act_input_n)
            perturb = [tf.stop_gradient(tf.nn.l2_normalize(elem, axis=1)) for elem in raw_perturb]
            perturb = [perturb[i] * adv_rate[i] for i in range(num_agents)]
            new_act_n = [perturb[i] + act_input_n[i] if i != p_index else act_input_n[i] for i in range(len(act_input_n))]
            adv_q_input = tf.concat(obs_ph_n + new_act_n, 1)
            adv_q = q_func(adv_q_input, 1, scope="q_func", reuse=True, num_units=num_units)[:, 0]
            # train the policy against the perturbed (worst-case) critic value
            pg_loss = -tf.reduce_mean(adv_q)

        loss = pg_loss + p_reg * 1e-3

        optimize_expr = U.minimize_and_clip(optimizer, loss, p_func_vars, grad_norm_clipping)

        # Create callable functions
        train = U.function(inputs=obs_ph_n + act_ph_n, outputs=loss, updates=[optimize_expr])
        act = U.function(inputs=[obs_ph_n[p_index]], outputs=act_sample)
        p_values = U.function([obs_ph_n[p_index]], p)

        # target network
        target_p = p_func(p_input, int(act_pdtype_n[p_index].param_shape()[0]), scope="target_p_func", num_units=num_units)
        target_p_func_vars = U.scope_vars(U.absolute_scope_name("target_p_func"))
        update_target_p = make_update_exp(p_func_vars, target_p_func_vars)

        target_act_sample = act_pdtype_n[p_index].pdfromflat(target_p).sample()
        target_act = U.function(inputs=[obs_ph_n[p_index]], outputs=target_act_sample)

        return act, train, update_target_p, {'p_values': p_values, 'target_act': target_act}
def p_train(make_obs_ph_n, act_space_n, p_index, p_func, q_func, optimizer, grad_norm_clipping=None, local_q_func=False, num_units=64, scope="trainer", reuse=None):
    with tf.variable_scope(scope, reuse=reuse):  # variable reuse
        # create the action probability distribution classes, one per agent's action space
        act_pdtype_n = [make_pdtype(act_space) for act_space in act_space_n]

        # set up placeholders
        obs_ph_n = make_obs_ph_n  # observations of every agent
        act_ph_n = [act_pdtype_n[i].sample_placeholder([None], name="action" + str(i)) for i in range(len(act_space_n))]
        # placeholders that hold every agent's chosen action; [None] leaves the batch size open

        p_input = obs_ph_n[p_index]  # only this agent's own observation

        # build the policy network; the output size is the number of action parameters (one value per action)
        p = p_func(p_input, int(act_pdtype_n[p_index].param_shape()[0]), scope="p_func", num_units=num_units)
        p_func_vars = U.scope_vars(U.absolute_scope_name("p_func"))  # all variables of this network

        # wrap parameters in distribution
        act_pd = act_pdtype_n[p_index].pdfromflat(p)
        act_sample = act_pd.sample()  # deterministic action plus exploration noise, i.e. a stochastic policy sample
        p_reg = tf.reduce_mean(tf.square(act_pd.flatparam()))
        # flatparam is the raw actor output for all actions; p_reg regularizes these raw outputs

        act_input_n = act_ph_n + []
        act_input_n[p_index] = act_pd.sample()  # replace only this agent's action with its own policy output
        # this step connects the two networks, so the Q network can be used to optimize this policy network
        q_input = tf.concat(obs_ph_n + act_input_n, 1)  # Q input: all observations plus all agents' actions
        if local_q_func:
            q_input = tf.concat([obs_ph_n[p_index], act_input_n[p_index]], 1)
        q = q_func(q_input, 1, scope="q_func", reuse=True, num_units=num_units)[:, 0]
        # reuse=True: use the critic variables already created by q_train (q_train and p_train share the same scope)

        # policy objective: maximize the mean Q value, i.e. minimize -reduce_mean(q)
        pg_loss = -tf.reduce_mean(q)
        loss = pg_loss + p_reg * 1e-3  # plus a small regularization term

        # gradient-descent optimizer op
        optimize_expr = U.minimize_and_clip(optimizer, loss, p_func_vars, grad_norm_clipping)

        # Create callable functions (run batched training through the session)
        train = U.function(inputs=obs_ph_n + act_ph_n, outputs=loss, updates=[optimize_expr])
        act = U.function(inputs=[obs_ph_n[p_index]], outputs=act_sample)  # action from this agent's own observation
        p_values = U.function([obs_ph_n[p_index]], p)  # raw policy outputs

        # target network
        target_p = p_func(p_input, int(act_pdtype_n[p_index].param_shape()[0]), scope="target_p_func", num_units=num_units)
        target_p_func_vars = U.scope_vars(U.absolute_scope_name("target_p_func"))
        update_target_p = make_update_exp(p_func_vars, target_p_func_vars)

        target_act_sample = act_pdtype_n[p_index].pdfromflat(target_p).sample()
        target_act = U.function(inputs=[obs_ph_n[p_index]], outputs=target_act_sample)

        return act, train, update_target_p, {'p_values': p_values, 'target_act': target_act}
def q_train(name, make_obs_ph_n, adj_n, act_space_n, num_adversaries, neighbor_n, q_func, agent_n, optimizer, grad_norm_clipping=None, local_q_func=False, reuse=None, scope="trainer", num_units=64): with tf.variable_scope(scope, reuse=reuse): # create distribtuions act_pdtype_n = [make_pdtype(act_space) for act_space in act_space_n] # number of agents in this species agent_n_species = num_adversaries if name == "adversaries" else agent_n - num_adversaries # set up placeholders obs_ph_n = make_obs_ph_n act_ph_n = [ act_pdtype_n[i].sample_placeholder([None], name="action" + str(i)) for i in range(len(act_space_n)) ] target_ph = [ tf.placeholder(tf.float32, [None], name="target") for _ in range(agent_n_species) ] q = [] q_square = [] q_input = tf.concat(obs_ph_n + act_ph_n, 1) for a in range(agent_n_species): temp = q_func(q_input, 1, scope="q_func_%d" % a, num_units=num_units)[:, 0] q.append(temp) # q1 = tf.stack([q[i] for i in range(agent_n_species)], axis=1) # q_square = [tf.square(tf.reduce_mean(q[i] - target_ph[i], axis=1)) for i in range(agent_n_species)] q_func_vars = [ U.scope_vars(U.absolute_scope_name("q_func_%d" % i)) for i in range(agent_n_species) ] q_loss = [ tf.reduce_mean(tf.square(q[i] - target_ph[i])) for i in range(agent_n_species) ] # viscosity solution to Bellman differential equation in place of an initial condition # q_reg = tf.reduce_mean(tf.square(q1)) loss = q_loss # + 1e-3 * q_reg optimize_expr = [ U.minimize_and_clip(optimizer, loss[i], q_func_vars[i], grad_norm_clipping) for i in range(agent_n_species) ] # Create callable functions train = [ U.function(inputs=obs_ph_n + act_ph_n + [target_ph[i]], outputs=loss[i], updates=[optimize_expr[i]]) for i in range(agent_n_species) ] q_values = U.function(obs_ph_n + act_ph_n, q) # target network target_q = [] for a in range(agent_n_species): temp = q_func(q_input, 1, scope="target_q_func_%d" % a, num_units=num_units)[:, 0] target_q.append(temp) target_q_func_vars = [ U.scope_vars(U.absolute_scope_name("target_q_func_%d" % i)) for i in range(agent_n_species) ] update_target_q = make_update_exp(q_func_vars, target_q_func_vars, central=False) target_q_values = U.function(obs_ph_n + act_ph_n, target_q) return train, update_target_q, q_values, target_q_values
def q_train(make_obs_ph_n, act_space_n, q_index, q_func, optimizer, grad_norm_clipping=None, local_q_func=False, scope="trainer", reuse=None, num_units=64):
    with tf.variable_scope(scope, reuse=reuse):
        # create action distributions
        # make_obs_ph_n are the input placeholders, with the same shape as obs_n
        act_pdtype_n = [make_pdtype(act_space) for act_space in act_space_n]
        # the distribution type is built from the action dimension (5 here);
        # act_space comes from env.action_space and is determined by the experiment environment

        # set up placeholders
        obs_ph_n = make_obs_ph_n
        act_ph_n = [act_pdtype_n[i].sample_placeholder([None], name="action" + str(i)) for i in range(len(act_space_n))]
        target_ph = tf.placeholder(tf.float32, [None], name="target")  # 1-D target placeholder
        # in all three placeholders, [None] is the (unknown) batch size

        q_input = tf.concat(obs_ph_n + act_ph_n, 1)
        # the critic input concatenates all agents' observations and actions along axis 1
        if local_q_func:  # with ddpg, train only on this agent's own observation and action
            q_input = tf.concat([obs_ph_n[q_index], act_ph_n[q_index]], 1)
        q = q_func(q_input, 1, scope="q_func", num_units=num_units)[:, 0]  # take column 0 of every row
        q_func_vars = U.scope_vars(U.absolute_scope_name("q_func"))  # variables of the Q network

        # target_ph will be fed with the TD target computed from the target network;
        # the loss is the mean squared error between the two
        q_loss = tf.reduce_mean(tf.square(q - target_ph))

        # viscosity solution to Bellman differential equation in place of an initial condition
        q_reg = tf.reduce_mean(tf.square(q))
        loss = q_loss  # + 1e-3 * q_reg

        # optimizer op, with optional gradient clipping
        optimize_expr = U.minimize_and_clip(optimizer, loss, q_func_vars, grad_norm_clipping)

        # Create callable (theano-style) functions
        train = U.function(inputs=obs_ph_n + act_ph_n + [target_ph], outputs=loss, updates=[optimize_expr])
        q_values = U.function(obs_ph_n + act_ph_n, q)

        # target network
        target_q = q_func(q_input, 1, scope="target_q_func", num_units=num_units)[:, 0]
        target_q_func_vars = U.scope_vars(U.absolute_scope_name("target_q_func"))
        update_target_q = make_update_exp(q_func_vars, target_q_func_vars)

        target_q_values = U.function(obs_ph_n + act_ph_n, target_q)

        # all returned values are callable functions that take values for the placeholders directly
        return train, update_target_q, {'q_values': q_values, 'target_q_values': target_q_values}
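# How target_ph is typically filled before calling train(...), sketched in the spirit
# of a MADDPG update step. gamma, the sampled batch arrays, and target_act_n (actions
# from each agent's target policy at the next observations) are assumptions, not
# values taken from this file.
import numpy as np

def build_td_target(rew, done, obs_next_n, target_act_n, target_q_values, gamma=0.95):
    # bootstrap with the target critic evaluated at the next observations and
    # the target-policy actions of all agents
    target_q_next = target_q_values(*(obs_next_n + target_act_n))
    return rew + gamma * (1.0 - done) * target_q_next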
def c_next(make_obs_ph, act_space, c_ph, c_next_func, num_constraints, optimizer, grad_norm_clipping, num_units=64, reuse=False, scope="c_next"): with tf.variable_scope(scope, reuse=reuse): # set up placeholders act_pdtype = make_pdtype(act_space[0]) obs_ph = make_obs_ph act_ph = act_pdtype.sample_placeholder([None], name="action") c_next_target_ph = [] for _ in range(num_constraints): c_next_target_ph.append( tf.placeholder(tf.float32, [None, 1], name="target" + str(_))) c_next_input = tf.concat(obs_ph, 1) gs_ = [] for _ in range(num_constraints): gs_.append( c_next_func(c_next_input, int((act_pdtype.param_shape()[0]) / 2), scope="c_next_func" + str(_), num_units=num_units)) c_ = [] # to be testified for _ in range(num_constraints): temp = c_ph[_] + tf.multiply(gs_[_], act_ph) c_.append(tf.reduce_sum(temp, -1)) c_next_vars = [ U.scope_vars(U.absolute_scope_name("c_next_func" + str(_))) for _ in range(num_constraints) ] diff = [(c_[_] - c_next_target_ph[_]) for _ in range(num_constraints)] c_next_loss = [ tf.reduce_mean(tf.square(diff[_])) for _ in range(num_constraints) ] optimize_expr = [ U.minimize_and_clip(optimizer, c_next_loss[_], c_next_vars[_], grad_norm_clipping) for _ in range(num_constraints) ] # Create callable functions train = [ U.function(inputs=[obs_ph] + [act_ph] + [c_ph[_]] + [c_next_target_ph[_]], outputs=c_next_loss[_], updates=[optimize_expr[_]]) ] c_next_values = [ U.function([obs_ph] + [act_ph] + [c_ph[_]], c_[_]) for _ in range(num_constraints) ] g_next_values = [ U.function([obs_ph], gs_[_]) for _ in range(num_constraints) ] return train, c_next_values, g_next_values
def p_train(name, make_obs_ph_n, adj_n, act_space_n, neighbor_n, p_index, p_func, q_func, num_adversaries, optimizer, grad_norm_clipping=None, local_q_func=False, num_units=128, scope="trainer", reuse=None): with tf.variable_scope(scope, reuse=reuse): # create distribtuions act_pdtype_n = [make_pdtype(act_space) for act_space in act_space_n] # set up placeholders obs_ph_n = make_obs_ph_n act_ph_n = [ act_pdtype_n[i].sample_placeholder([None], name="action" + str(i)) for i in range(len(act_space_n)) ] agent_n = len(obs_ph_n) vec_n = U.BatchInput([1, neighbor_n], name="vec").get() p_input1 = obs_ph_n[ 0:num_adversaries] if name == "adversaries" else obs_ph_n[ num_adversaries:agent_n] p_input2 = adj_n[0:num_adversaries] if name == "adversaries" else adj_n[ num_adversaries:agent_n] p_input3 = vec_n # call for actor network # act_space is not good!!!!!!!!!! p = p_func(p_input1, p_input2, p_input3, neighbor_n, num_adversaries if name == "adversaries" else (agent_n - num_adversaries), 5, scope="p_func", num_units=num_units) p_func_vars = U.scope_vars(U.absolute_scope_name("p_func")) # wrap parameters in distribution act_pd = [] act_sample = [] for i in range(0, num_adversaries) if name == "adversaries" else range( num_adversaries, agent_n): act_pd_temp = act_pdtype_n[i].pdfromflat( p[i - (0 if name == "adversaries" else num_adversaries)]) act_pd.append(act_pd_temp) act_sample.append(act_pd_temp.sample()) temp = [] for i in range(len(act_pd)): temp.append(act_pd[i].flatparam()) # Is this regularization method correct?????????????????????????????/ p_reg = tf.reduce_mean(tf.square(temp)) act_input_n = act_ph_n + [] for i in range(0, num_adversaries) if name == "adversaries" else range( num_adversaries, agent_n): act_input_n[i] = act_sample[ i - (0 if name == "adversaries" else num_adversaries)] q_input = tf.concat(obs_ph_n + act_input_n, 1) q = [] q_reduce_mean = [] for a in range(0, num_adversaries) if name == "adversaries" else range( num_adversaries, agent_n): index = a if name == "adversaries" else a - num_adversaries temp = q_func(q_input, 1, scope="q_func_%d" % index, reuse=True, num_units=num_units)[:, 0] q.append(temp) q_reduce_mean += temp pg_loss = -tf.reduce_mean(q) loss = pg_loss + p_reg * 1e-3 optimize_expr = U.minimize_and_clip(optimizer, loss, p_func_vars, grad_norm_clipping) # Create callable functions train = U.function(inputs=obs_ph_n + act_ph_n + adj_n + [vec_n], outputs=loss, updates=[optimize_expr]) act = U.function(inputs=p_input1 + (adj_n[0:num_adversaries] if name == "adversaries" else adj_n[num_adversaries:agent_n]) + [p_input3], outputs=act_sample, list_output=True) p_values = U.function( p_input1 + (adj_n[0:num_adversaries] if name == "adversaries" else adj_n[num_adversaries:agent_n]) + [p_input3], p, list_output=True) # target network target_p = p_func(p_input1, p_input2, p_input3, neighbor_n, num_adversaries if name == "adversaries" else (agent_n - num_adversaries), 5, scope="target_p_func", num_units=num_units) target_p_func_vars = U.scope_vars( U.absolute_scope_name("target_p_func")) update_target_p = make_update_exp(p_func_vars, target_p_func_vars, central=True) target_act_sample = [] for i in range(0, num_adversaries) if name == "adversaries" else range( num_adversaries, agent_n): target_act_sample.append(act_pdtype_n[i].pdfromflat(target_p[i - ( 0 if name == "adversaries" else num_adversaries)]).sample()) target_act = U.function( inputs=p_input1 + (adj_n[0:num_adversaries] if name == "adversaries" else adj_n[num_adversaries:agent_n]) + [p_input3], 
outputs=target_act_sample, list_output=True) return act, train, update_target_p, p_values, target_act
def q_train(make_obs_ph_n, act_space_n, make_obs_history_n, make_act_history_n, q_index, q_func, optimizer,
            grad_norm_clipping=None, local_q_func=False, scope="trainer", reuse=None, num_units=64):
    """Q-learning.

    Args:
        make_obs_ph_n (list of tf.placeholder): Placeholders for the observations of all agents
        act_space_n (list): A list of the action spaces of all agents
        make_obs_history_n (list of tf.placeholder): Placeholders for the observation history of all agents
        make_act_history_n (list of tf.placeholder): Placeholders for the action history of all agents
        q_index (int): Agent index number
        q_func (function): MLP neural-network model for the agent
        optimizer (function): Network optimizer function
        grad_norm_clipping (float): Value by which to clip the norm of the gradient
        local_q_func (boolean): Flag for using a local q function
        num_units (int): The number of outputs for the layers of the model
        scope (str): The name of the scope
        reuse (boolean): Flag specifying whether to reuse the scope

    Returns:
        train (function): Training function for the Q network
        update_target_q (function): Update function for the target Q network
        q_debug (dict): Contains 'q_values' and 'target_q_values' of the Q network
    """
    with tf.variable_scope(scope, reuse=reuse):
        # Create distributions
        act_pdtype_n = [make_pdtype(act_space) for act_space in act_space_n]

        # Set up placeholders
        obs_ph_n = make_obs_ph_n
        obs_history_n = make_obs_history_n
        act_history_n = make_act_history_n
        act_ph_n = [act_pdtype_n[i].sample_placeholder([None], name="action" + str(i))
                    for i in range(len(act_space_n))]
        target_ph = tf.placeholder(tf.float32, [None], name="target")

        # obs_ph_n = [tf.concat(3*[x], 1, name="observation{}".format(i)) for i, x in enumerate(obs_ph_n)]
        # act_ph_n = [tf.concat(3*[x], 1, name="action{}".format(i)) for i, x in enumerate(act_ph_n)]

        # Original implementation:
        # q_input = tf.concat(obs_ph_n + act_ph_n, 1)
        # Modified: current plus two previous time-steps
        q_input = tf.concat(obs_ph_n + obs_history_n + act_ph_n + act_history_n, 1)
        if local_q_func:
            # Only use this agent's own observation and action when running 'ddpg';
            # importantly, the agent's own position is relative to the prey
            q_input = tf.concat([obs_ph_n[q_index], act_ph_n[q_index]], 1)

        q = q_func(q_input, 1, scope="q_func", num_units=num_units)[:, 0]
        q_func_vars = tf_util.scope_vars(tf_util.absolute_scope_name("q_func"))

        # ************************************************************************************************
        # ccm_input = data for ccm
        # ccm_value = ccm_func(ccm_input)
        # ************************************************************************************************
        # q_loss = tf.reduce_mean(tf.square(q - target_ph)) - ccm_loss
        q_loss = tf.reduce_mean(tf.square(q - target_ph))

        # Viscosity solution to the Bellman differential equation in place of an initial condition
        q_reg = tf.reduce_mean(tf.square(q))
        # loss = q_loss + 1e-3 * q_reg
        loss = q_loss

        optimize_expr = tf_util.minimize_and_clip(optimizer, loss, q_func_vars, grad_norm_clipping)

        # Create callable functions
        train = tf_util.function(inputs=obs_ph_n + obs_history_n + act_ph_n + act_history_n + [target_ph],
                                 outputs=loss, updates=[optimize_expr])
        q_values = tf_util.function(obs_ph_n + obs_history_n + act_ph_n + act_history_n, q)

        # Target network
        target_q = q_func(q_input, 1, scope="target_q_func", num_units=num_units)[:, 0]
        target_q_func_vars = tf_util.scope_vars(tf_util.absolute_scope_name("target_q_func"))
        update_target_q = make_update_exp(q_func_vars, target_q_func_vars)
        target_q_values = tf_util.function(obs_ph_n + obs_history_n + act_ph_n + act_history_n, target_q)

        return train, update_target_q, {'q_values': q_values, 'target_q_values': target_q_values}
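# Hedged usage sketch for the history-augmented critic above. The sizes (`batch`,
# `n_agents`, `obs_dim`, `act_dim`, `hist_len`) are hypothetical, and `q_debug` stands
# for the dict returned by q_train; the only point illustrated is the input ordering the
# returned functions expect: all current observations, then all observation histories,
# then all current actions, then all action histories.
import numpy as np

batch, n_agents, obs_dim, act_dim, hist_len = 32, 3, 10, 5, 2   # hypothetical sizes

obs_n = [np.random.randn(batch, obs_dim).astype(np.float32) for _ in range(n_agents)]
obs_hist_n = [np.random.randn(batch, hist_len * obs_dim).astype(np.float32) for _ in range(n_agents)]
act_n = [np.random.randn(batch, act_dim).astype(np.float32) for _ in range(n_agents)]
act_hist_n = [np.random.randn(batch, hist_len * act_dim).astype(np.float32) for _ in range(n_agents)]

q_batch = q_debug['q_values'](*(obs_n + obs_hist_n + act_n + act_hist_n))   # shape: (batch,)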
def p_train_recurrent(make_obs_ph_n, make_state_ph_n, make_obs_next_n, make_obs_pred_n, act_space_n, p_index,
                      p_policy, p_predict, q_func, optimizer, grad_norm_clipping=None, local_q_func=False,
                      num_units=64, scope="trainer", reuse=None):
    with tf.variable_scope(scope, reuse=reuse):
        # set up placeholders
        obs_ph_n = make_obs_ph_n          # all obs, in shape agent_num * batch_size * time_step * obs_shape
        obs_next_n = make_obs_next_n
        state_ph_n = make_state_ph_n
        obs_pred_n = make_obs_pred_n      # used for action selection

        # create distributions
        act_pdtype_n = [make_pdtype(act_space) for act_space in act_space_n]
        act_ph_n = [act_pdtype_n[i].sample_placeholder([None], name="action" + str(i))
                    for i in range(len(act_space_n))]

        # p_input is the local observation of this agent
        obs_input = obs_ph_n[p_index]
        state_input = state_ph_n[p_index]
        act_input = act_ph_n[p_index]
        obs_next = obs_next_n[p_index]
        obs_pred_input = obs_pred_n[p_index]

        # get the policy output and the recurrent state
        p, gru_out, state = p_policy(obs_input, state_input, obs_pred_input,
                                     int(act_pdtype_n[p_index].param_shape()[0]),
                                     scope="p_policy", num_units=num_units)
        act_pd = act_pdtype_n[p_index].pdfromflat(p)   # wrap parameters in a distribution
        act_sample = act_pd.sample()                   # sample an action

        # predict the next observation
        obs_pred = p_predict(act_input, gru_out, int(obs_input.shape[1]),
                             scope="p_predict", num_units=num_units)

        # variables for optimization
        p_func_vars = U.scope_vars(U.absolute_scope_name("p_policy")) + \
            U.scope_vars(U.absolute_scope_name("p_predict"))

        pred_loss = tf.reduce_mean(tf.square(obs_next - obs_pred))   # prediction loss
        p_reg = tf.reduce_mean(tf.square(act_pd.flatparam()))        # regularization term

        # use the critic network to compute the policy loss
        act_input_n = act_ph_n + []
        act_input_n[p_index] = act_pd.sample()           # only modify the action of this agent
        q_input = tf.concat(obs_ph_n + act_input_n, 1)   # Q-net input: all obs + all actions
        if local_q_func:
            q_input = tf.concat([obs_ph_n[p_index], act_input_n[p_index]], 1)
        q = q_func(q_input, 1, scope="q_func", reuse=True, num_units=num_units)[:, 0]   # get Q values

        pg_loss = -tf.reduce_mean(q)   # loss that maximizes the Q values
        loss = pg_loss + p_reg * 1e-3 + pred_loss * 1e-3

        # update the policy-network parameters
        optimize_expr = U.minimize_and_clip(optimizer, loss, p_func_vars, grad_norm_clipping)

        # Create callable functions
        # update the P net
        train = U.function(inputs=obs_ph_n + state_ph_n + act_ph_n + obs_next_n + obs_pred_n,
                           outputs=loss, updates=[optimize_expr])
        # return the action and the recurrent state
        step = U.function(inputs=[obs_ph_n[p_index]] + [state_ph_n[p_index]] + [obs_pred_n[p_index]],
                          outputs=[act_sample] + [state] + [gru_out])
        p_values = U.function(inputs=[obs_ph_n[p_index]] + [state_ph_n[p_index]] + [obs_pred_n[p_index]],
                              outputs=p)

        # target network
        target_p, target_gru_out, target_state = \
            p_policy(obs_input, state_input, obs_pred_input,
                     int(act_pdtype_n[p_index].param_shape()[0]),
                     scope="target_p_policy", num_units=num_units)
        target_obs_pred = p_predict(act_input, target_gru_out, int(obs_input.shape[1]),
                                    scope="target_p_predict", num_units=num_units)
        target_p_func_vars = U.scope_vars(U.absolute_scope_name("target_p_policy")) + \
            U.scope_vars(U.absolute_scope_name("target_p_predict"))

        # soft update of the target parameters: θ'_i = τ·θ_i + (1 − τ)·θ'_i
        update_target_p = make_update_exp(p_func_vars, target_p_func_vars)

        target_act_sample = act_pdtype_n[p_index].pdfromflat(target_p).sample()
        target_step = U.function(inputs=[obs_ph_n[p_index]] + [state_ph_n[p_index]] + [obs_pred_n[p_index]],
                                 outputs=[target_act_sample] + [target_state] + [target_gru_out])

        # return the predicted observation
        gru_temp = tf.placeholder(tf.float32, [None] + [num_units], name='gru_out')
        pred_temp = p_predict(act_input, gru_temp, int(obs_input.shape[1]),
                              scope="p_predict", num_units=num_units)
        predict = U.function(inputs=[act_ph_n[p_index]] + [gru_temp], outputs=pred_temp)
        target_pred_temp = p_predict(act_input, gru_temp, int(obs_input.shape[1]),
                                     scope="target_p_predict", num_units=num_units)
        target_predict = U.function(inputs=[act_ph_n[p_index]] + [gru_temp], outputs=target_pred_temp)

        return step, predict, train, update_target_p, {'p_values': p_values,
                                                       'target_step': target_step,
                                                       'target_predict': target_predict}
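# Hedged rollout sketch for the recurrent trainer above. The environment interface
# (`env.reset`/`env.step`), `max_steps`, `obs_dim`, and the zero initial GRU state and
# observation prediction are assumptions for illustration only; `step` and `predict`
# are the functions returned by p_train_recurrent, and the single-observation batch
# layout (shape (1, obs_dim)) is an assumption about the placeholder shapes.
import numpy as np

state = np.zeros((1, num_units), dtype=np.float32)      # initial GRU state
obs_pred = np.zeros((1, obs_dim), dtype=np.float32)     # initial observation prediction
obs = env.reset()

for t in range(max_steps):
    # step returns [sampled action, new recurrent state, GRU features]
    action, state, gru_out = step(obs[None], state, obs_pred)
    next_obs, reward, done, _ = env.step(action[0])
    # predict the next observation from the chosen action and the GRU features
    obs_pred = predict(action, gru_out)
    obs = next_obs
    if done:
        break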
def p_train(make_obs_ph_n, act_space_n, make_obs_history_n, make_act_history_n, p_index, p_func, q_func, optimizer,
            grad_norm_clipping=None, local_q_func=False, num_units=64, scope="trainer", reuse=None):
    """Policy learning guided by the Q value.

    Args:
        make_obs_ph_n (list of tf.placeholder): Placeholders for the observations of all agents
        act_space_n (list): A list of the action spaces of all agents
        make_obs_history_n (list of tf.placeholder): Placeholders for the observation history of all agents
        make_act_history_n (list of tf.placeholder): Placeholders for the action history of all agents
        p_index (int): Agent index number
        p_func (function): MLP neural-network model for the policy
        q_func (function): MLP neural-network model for the critic
        optimizer (function): Network optimizer function
        grad_norm_clipping (float): Value by which to clip the norm of the gradient
        local_q_func (boolean): Flag for using a local q function
        num_units (int): The number of outputs for the layers of the model
        scope (str): The name of the scope
        reuse (boolean): Flag specifying whether to reuse the scope

    Returns:
        act (function): Action function for retrieving the agent's action
        train (function): Training function for the P network
        update_target_p (function): Update function for the target P network
        p_debug (dict): Contains 'p_values' and 'target_act' of the P network
    """
    with tf.variable_scope(scope, reuse=reuse):
        # Create distributions
        act_pdtype_n = [make_pdtype(act_space) for act_space in act_space_n]

        # Set up placeholders
        obs_ph_n = make_obs_ph_n
        obs_history_n = make_obs_history_n
        act_history_n = make_act_history_n
        act_ph_n = [act_pdtype_n[i].sample_placeholder([None], name="action" + str(i))
                    for i in range(len(act_space_n))]
        ccm_ph_n = [tf.placeholder(tf.float32, [None], name="ccm" + str(p_index))]
        ccm_lambda = [tf.placeholder(tf.float32, [None], name="lambda" + str(p_index))]
        ccm_switch = [tf.placeholder(tf.float32, [None], name="switch" + str(p_index))]

        # Original implementation:
        # p_input = obs_ph_n[p_index]
        # Modified
        p_input = tf.concat([obs_ph_n[p_index], obs_history_n[p_index]], 1)

        p = p_func(p_input, int(act_pdtype_n[p_index].param_shape()[0]), scope="p_func", num_units=num_units)
        p_func_vars = tf_util.scope_vars(tf_util.absolute_scope_name("p_func"))

        # Wrap parameters in a distribution
        act_pd = act_pdtype_n[p_index].pdfromflat(p)
        act_sample = act_pd.sample()
        p_reg = tf.reduce_mean(tf.square(act_pd.flatparam()))

        # Original implementation
        act_input_n = act_ph_n + []
        act_input_n[p_index] = act_pd.sample()
        # Modified:
        # act_input_n = act_hist_ph_n + []
        # act_input_n[p_index] = act_pd.mode()

        # Original implementation:
        # q_input = tf.concat(obs_ph_n + act_input_n, 1)
        # Modified: current plus previous time-steps
        q_input = tf.concat(obs_ph_n + obs_history_n + act_ph_n + act_history_n, 1)
        if local_q_func:
            # Only use this agent's own observation and action when running 'ddpg';
            # importantly, the observation is [my.x, my.y, my.dx, my.dy, r1.x]
            q_input = tf.concat([obs_ph_n[p_index], act_input_n[p_index]], 1)

        q = q_func(q_input, 1, scope="q_func", reuse=True, num_units=num_units)[:, 0]

        # This is probably due to DDPG rather than DSPG:
        # pg_loss = -tf.reduce_mean(q)
        # ************************************************************************************************
        # ccm_input = something
        # ccm_value = ccm_func(ccm_input)
        pg_loss = -(1 - ccm_lambda[0]) * tf.reduce_mean(q) * (1 - ccm_switch[0]) - \
            ccm_lambda[0] * ccm_ph_n[0] * ccm_switch[0]
        # ************************************************************************************************

        loss = pg_loss + p_reg * 1e-3

        optimize_expr = tf_util.minimize_and_clip(optimizer, loss, p_func_vars, grad_norm_clipping)

        # Create callable functions
        train = tf_util.function(inputs=obs_ph_n + obs_history_n + act_ph_n + act_history_n +
                                 ccm_ph_n + ccm_lambda + ccm_switch,
                                 outputs=loss, updates=[optimize_expr])

        # Original implementation:
        # act = tf_util.function(inputs=[obs_ph_n[p_index]], outputs=act_sample)
        # p_values = tf_util.function([obs_ph_n[p_index]], p)
        # Modified
        act = tf_util.function(inputs=[obs_ph_n[p_index]] + [obs_history_n[p_index]], outputs=act_sample)
        p_values = tf_util.function(inputs=[obs_ph_n[p_index]] + [obs_history_n[p_index]], outputs=p)

        # Target network
        target_p = p_func(p_input, int(act_pdtype_n[p_index].param_shape()[0]),
                          scope="target_p_func", num_units=num_units)
        target_p_func_vars = tf_util.scope_vars(tf_util.absolute_scope_name("target_p_func"))
        update_target_p = make_update_exp(p_func_vars, target_p_func_vars)
        target_act_sample = act_pdtype_n[p_index].pdfromflat(target_p).sample()

        # Original implementation:
        # target_act = tf_util.function(inputs=[obs_ph_n[p_index]], outputs=target_act_sample)
        # Modified
        target_act = tf_util.function(inputs=[obs_ph_n[p_index]] + [obs_history_n[p_index]],
                                      outputs=target_act_sample)

        return act, train, update_target_p, {'p_values': p_values, 'target_act': target_act}
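# Hedged illustration of the ccm_lambda / ccm_switch gating used in pg_loss above:
# with switch = 0 only the (1 - lambda)-weighted policy-gradient term is active, and
# with switch = 1 only the lambda-weighted CCM term is active. The numeric values are
# made up; `gated_pg_loss` is just the scalar form of the expression built in the graph.

def gated_pg_loss(mean_q, ccm_value, lam, switch):
    return -(1.0 - lam) * mean_q * (1.0 - switch) - lam * ccm_value * switch

print(gated_pg_loss(mean_q=2.0, ccm_value=0.5, lam=0.3, switch=0.0))   # -1.4  (pure PG term)
print(gated_pg_loss(mean_q=2.0, ccm_value=0.5, lam=0.3, switch=1.0))   # -0.15 (pure CCM term)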
def p_train(make_obs_ph_n, act_space_n, p_index, p_func, q_func, optimizer, num_outputs,
            grad_norm_clipping=None, local_q_func=False, num_units=64, scope="coma_trainer", reuse=None):
    with tf.variable_scope(scope, reuse=reuse):
        # create distributions
        act_pdtype_n = [make_pdtype(act_space) for act_space in act_space_n]

        # set up placeholders
        obs_ph_n = make_obs_ph_n
        # act_ph_n = [act_pdtype_n[i].sample_placeholder([None], name="action"+str(i)) for i in range(len(act_space_n))]
        act_ph_n = [tf.placeholder(tf.int32, [None], name="action" + str(i))
                    for i in range(len(act_space_n))]

        # the actor's input is its local observation
        p_input = obs_ph_n[p_index]
        p = p_func(p_input, int(act_pdtype_n[p_index].param_shape()[0]),
                   scope="coma_p_func", num_units=num_units)
        # NOTE: the scope name must match the one used above ("coma_p_func", not "p_func")
        p_func_vars = U.scope_vars(U.absolute_scope_name("coma_p_func"))

        # wrap parameters in a distribution: the probabilities of the individual actions
        act_pd = act_pdtype_n[p_index].pdfromflat(p)
        # sample() is a Gumbel-softmax sample; COMA training needs one concrete action,
        # so an argmax operation is required
        act_sample = act_pd.sample()
        # NOTE: the Python-level loops below iterate over tensors and only work with
        # eager/NumPy semantics; in graph mode this would be tf.argmax(act_sample, axis=1)
        # (see the graph-mode sketch after this function)
        act_picked = [act.tolist().index(max(act)) for act in act_sample]

        p_reg = tf.reduce_mean(tf.square(act_pd.flatparam()))

        # act_ph_n + [] makes a shallow copy so the placeholder list itself is not modified
        act_input_n = act_ph_n + []
        # replace the current agent's action with the picked action
        act_input_n[p_index] = act_picked
        q_input = tf.concat(obs_ph_n + act_input_n, 1)
        if local_q_func:
            q_input = tf.concat([obs_ph_n[p_index], act_input_n[p_index]], 1)
        q = q_func(q_input, num_outputs, scope="coma_q_func", reuse=True, num_units=num_units)

        # counterfactual baseline
        baseline = [baseline_calculation(act_distribute, q_list)
                    for act_distribute, q_list in zip(act_sample, q)]
        # Q value of the action that was actually taken
        actual_picked_q = [q_list[act] for act, q_list in zip(act_picked, q)]
        # advantage of the taken action over the counterfactual baseline
        a = [q_taken - b for q_taken, b in zip(actual_picked_q, baseline)]

        pg_loss = -tf.reduce_mean(a)
        loss = pg_loss + p_reg * 1e-3

        optimize_expr = U.minimize_and_clip(optimizer, loss, p_func_vars, grad_norm_clipping)

        # Create callable functions
        train = U.function(inputs=obs_ph_n + act_ph_n, outputs=loss, updates=[optimize_expr])
        act = U.function(inputs=[obs_ph_n[p_index]], outputs=act_sample)
        p_values = U.function([obs_ph_n[p_index]], p)

        # target network
        target_p = p_func(p_input, int(act_pdtype_n[p_index].param_shape()[0]),
                          scope="coma_target_p_func", num_units=num_units)
        # NOTE: the scope name must match the one used above ("coma_target_p_func", not "target_p_func")
        target_p_func_vars = U.scope_vars(U.absolute_scope_name("coma_target_p_func"))
        update_target_p = make_update_exp(p_func_vars, target_p_func_vars)
        target_act_sample = act_pdtype_n[p_index].pdfromflat(target_p).sample()
        target_act = U.function(inputs=[obs_ph_n[p_index]], outputs=target_act_sample)

        return act, train, update_target_p, {'p_values': p_values, 'target_act': target_act}
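# Hedged graph-mode sketch of the counterfactual-advantage computation referenced in the
# NOTE above. It assumes `baseline_calculation` (defined elsewhere) computes the COMA
# baseline sum_a pi(a|s) * Q(s, a) per sample; the tensor version below avoids the
# Python-level iteration over tensors, which only works with eager/NumPy semantics.
# The function name `coma_advantage` is illustrative, not part of the original code.
import tensorflow as tf

def coma_advantage(act_probs, q_all_actions):
    """act_probs: [batch, n_actions] policy distribution; q_all_actions: [batch, n_actions] per-action Q values."""
    act_picked = tf.argmax(act_probs, axis=1)                            # greedy action index
    baseline = tf.reduce_sum(act_probs * q_all_actions, axis=1)          # E_{a~pi}[Q(s, a)]
    n_actions = tf.shape(q_all_actions)[1]
    picked_q = tf.reduce_sum(tf.one_hot(act_picked, n_actions) * q_all_actions, axis=1)
    return picked_q - baseline                                           # counterfactual advantage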