def p_approx_train(make_obs_ph_n, act_space_n, p_index, p_func, q_func, optimizer,
                   grad_norm_clipping=None, local_q_func=False, num_units=64,
                   scope="trainer", reuse=None):
    """Build the graph that fits a policy to actions fed through a placeholder.

    The loss is the negative mean log-probability of agent ``p_index``'s fed
    action under the policy distribution, plus ``1e-3`` times the negative
    mean entropy of that distribution.

    Args:
        make_obs_ph_n: list of observation placeholders, one per agent.
        act_space_n: list of action spaces, one per agent.
        p_index: index of the agent whose policy is built.
        p_func: network builder called as
            ``p_func(input, n_outputs, scope=..., num_units=...)``.
        q_func: accepted for signature parity; not used in this function.
        optimizer: optimizer handed to ``U.minimize_and_clip``.
        grad_norm_clipping: clipping value handed to ``U.minimize_and_clip``.
        local_q_func: accepted for signature parity; not used in this function.
        num_units: width argument forwarded to ``p_func`` for both the online
            and the target network.
        scope: variable scope name.
        reuse: ``reuse`` argument of the variable scope.

    Returns:
        ``(act, train, update_target_p, sync_target_p, debug)`` where ``debug``
        maps ``'p_values'``, ``'kl_loss'`` and ``'target_act'`` to callables.
    """
    with tf.variable_scope(scope, reuse=reuse):
        # One action-distribution type per agent.
        act_pdtype_n = [make_pdtype(act_space) for act_space in act_space_n]

        # Placeholders: observations come from the caller, the rest is created here.
        obs_ph_n = make_obs_ph_n
        act_ph_n = [
            act_pdtype_n[i].sample_placeholder([None], name="action" + str(i))
            for i in range(len(act_space_n))
        ]
        act_logits_ph_n = [
            act_pdtype_n[i].sample_placeholder([None], name="action_mode" + str(i))
            for i in range(len(act_space_n))
        ]

        p_input = obs_ph_n[p_index]
        n_params = int(act_pdtype_n[p_index].param_shape()[0])

        # FIX: forward num_units here as well. The target network below is
        # built with num_units=num_units, and the target update assigns online
        # variables to target variables, so both must share the same shapes.
        p = p_func(p_input, n_params, scope="p_func", num_units=num_units)
        p_func_vars = U.scope_vars(U.absolute_scope_name("p_func"))

        # Wrap the network output in a distribution object.
        act_pd = act_pdtype_n[p_index].pdfromflat(p)
        act_sample = act_pd.sample()

        # Entropy regulariser (negated, so minimising the loss raises entropy).
        p_reg = -tf.reduce_mean(act_pd.entropy())

        # The regression target is the action fed for this agent.
        act_target = act_ph_n[p_index]
        pg_loss = -tf.reduce_mean(act_pd.logp(act_target))

        loss = pg_loss + p_reg * 1e-3

        optimize_expr = U.minimize_and_clip(optimizer, loss, p_func_vars, grad_norm_clipping)

        # Callable wrappers around the graph.
        train = U.function(inputs=obs_ph_n + act_ph_n, outputs=loss, updates=[optimize_expr])
        act = U.function(inputs=[obs_ph_n[p_index]], outputs=act_sample)
        p_values = U.function([obs_ph_n[p_index]], p)

        # Target network.
        target_p = p_func(p_input, n_params, scope="target_p_func", num_units=num_units)
        target_p_func_vars = U.scope_vars(U.absolute_scope_name("target_p_func"))
        update_target_p = make_update_exp(p_func_vars, target_p_func_vars)
        # NOTE: requires a make_update_exp that accepts a ``rate`` keyword.
        sync_target_p = make_update_exp(p_func_vars, target_p_func_vars, rate=1.0)

        target_pd = act_pdtype_n[p_index].pdfromflat(target_p)
        target_act_sample = target_pd.sample()
        target_act = U.function(inputs=[obs_ph_n[p_index]], outputs=target_act_sample)

        # KL between the target policy and a distribution built from fed parameters.
        target_ph_pd = act_pdtype_n[p_index].pdfromflat(act_logits_ph_n[p_index])
        kl_loss = tf.reduce_mean(target_pd.kl(target_ph_pd))
        f_kl_loss = U.function(
            inputs=[obs_ph_n[p_index], act_logits_ph_n[p_index]], outputs=kl_loss)

        return act, train, update_target_p, sync_target_p, {
            'p_values': p_values, 'kl_loss': f_kl_loss, 'target_act': target_act}
def p_train(make_obs_ph_n, act_space_n, p_index, p_func, q_func, optimizer,
            grad_norm_clipping=None, local_q_func=False, num_units=64,
            scope="trainer", reuse=tf.AUTO_REUSE):
    """Build the policy-training graph for agent ``p_index``.

    The policy loss is the negative mean of ``q_func`` evaluated with this
    agent's action replaced by a sample from its own policy, plus ``1e-3``
    times the mean squared distribution parameters.

    Returns:
        ``(act_test, act, train, update_target_p, debug)``. ``act_test``
        outputs the raw policy output ``p``; ``act`` outputs a sample.
        ``debug`` holds ``'p_values'``, ``'target_act'``, ``'p_vars'`` and
        ``'target_p_vars'``.
    """
    with tf.variable_scope(scope, reuse=reuse):
        # Action-distribution type for every agent.
        pdtypes = [make_pdtype(space) for space in act_space_n]

        # Observation placeholders are supplied by the caller; action
        # placeholders are created here, one per agent.
        obs_ph_n = make_obs_ph_n
        act_ph_n = [
            pdtype.sample_placeholder([None], name="action" + str(idx))
            for idx, pdtype in enumerate(pdtypes)
        ]

        own_obs = obs_ph_n[p_index]
        own_pdtype = pdtypes[p_index]

        p = p_func(own_obs, int(own_pdtype.param_shape()[0]),
                   scope="p_func", num_units=num_units)
        p_func_vars = U.scope_vars(U.absolute_scope_name("p_func"))

        # Distribution over this agent's actions.
        act_pd = own_pdtype.pdfromflat(p)
        act_sample = act_pd.sample()
        p_reg = tf.reduce_mean(tf.square(act_pd.flatparam()))

        # Copy the fed actions and swap in a fresh policy sample for this agent.
        act_input_n = list(act_ph_n)
        act_input_n[p_index] = act_pd.sample()

        q_input = tf.concat(obs_ph_n + act_input_n, 1)
        if local_q_func:
            q_input = tf.concat([own_obs, act_input_n[p_index]], 1)
        q = q_func(q_input, 1, scope="q_func", reuse=True, num_units=num_units)[:, 0]

        pg_loss = -tf.reduce_mean(q)
        loss = pg_loss + p_reg * 1e-3

        optimize_expr = U.minimize_and_clip(optimizer, loss, p_func_vars, grad_norm_clipping)

        # Callable wrappers.
        train = U.function(inputs=obs_ph_n + act_ph_n, outputs=loss, updates=[optimize_expr])
        act = U.function(inputs=[own_obs], outputs=act_sample)
        act_test = U.function(inputs=[own_obs], outputs=p)
        p_values = U.function([own_obs], p)

        # Target policy network and its update op.
        target_p = p_func(own_obs, int(own_pdtype.param_shape()[0]),
                          scope="target_p_func", num_units=num_units)
        target_p_func_vars = U.scope_vars(U.absolute_scope_name("target_p_func"))
        update_target_p = make_update_exp(p_func_vars, target_p_func_vars)

        target_act_sample = own_pdtype.pdfromflat(target_p).sample()
        target_act = U.function(inputs=[own_obs], outputs=target_act_sample)

        debug = {
            'p_values': p_values,
            'target_act': target_act,
            'p_vars': p_func_vars,
            'target_p_vars': target_p_func_vars,
        }
        return act_test, act, train, update_target_p, debug
def q_train(n_agents, make_state_ph_n, make_obs_ph_n, act_space_n, q_index, q_func, optimizer,
            grad_norm_clipping=None, local_q_func=False, scope="trainer", reuse=None,
            num_units=64, discrete_action=False, target_update_tau=0.001,
            use_global_state=False, share_weights=False):
    """Build the critic-training graph for agent ``q_index``.

    The critic input is the concatenation of all observation (or state)
    placeholders and all action placeholders; with ``local_q_func`` only this
    agent's own pair is used. With ``share_weights`` a one-hot row identifying
    the agent is appended to the input. The loss is the mean squared error
    against the fed ``target`` placeholder.

    Returns:
        ``(train, update_target_q, debug)`` where ``debug`` holds
        ``'q_values'`` and ``'target_q_values'``.
    """
    with tf.variable_scope(scope, reuse=reuse):
        # Action-distribution type for every agent.
        pdtypes = [make_pdtype(space) for space in act_space_n]

        # Choose between per-agent observations and global-state placeholders.
        obs_ph_n = make_state_ph_n if use_global_state else make_obs_ph_n
        act_ph_n = [
            pdtype.sample_placeholder([None], name="action" + str(idx))
            for idx, pdtype in enumerate(pdtypes)
        ]
        target_ph = tf.placeholder(tf.float32, [None], name="target")

        q_input = tf.concat(obs_ph_n + act_ph_n, 1)
        if local_q_func:
            q_input = tf.concat([obs_ph_n[q_index], act_ph_n[q_index]], 1)
        if share_weights:
            # Layers are shared between agents, so tag the input with the agent id.
            agent_row = tf.eye(n_agents)[q_index:q_index + 1]
            batch_len = tf.shape(q_input)[0]
            q_input = tf.concat([q_input, tf.tile(agent_row, [batch_len, 1])], -1)

        q = q_func(q_input, 1, scope="q_func", reuse=share_weights, num_units=num_units,
                   constrain_out=False, discrete_action=discrete_action)[:, 0]
        q_func_vars = U.scope_vars(U.absolute_scope_name("q_func"))

        q_loss = tf.reduce_mean(tf.square(q - target_ph))

        # Regulariser kept in the graph but currently left out of the loss.
        q_reg = tf.reduce_mean(tf.square(q))
        loss = q_loss

        optimize_expr = U.minimize_and_clip(optimizer, loss, q_func_vars, grad_norm_clipping)

        # Callable wrappers.
        critic_inputs = obs_ph_n + act_ph_n
        train = U.function(inputs=critic_inputs + [target_ph], outputs=loss,
                           updates=[optimize_expr])
        q_values = U.function(critic_inputs, q)

        # Target critic, fed the same input tensor.
        target_q = q_func(q_input, 1, scope="target_q_func", reuse=share_weights,
                          num_units=num_units, constrain_out=False,
                          discrete_action=discrete_action)[:, 0]
        target_q_func_vars = U.scope_vars(U.absolute_scope_name("target_q_func"))
        update_target_q = make_update_exp(q_func_vars, target_q_func_vars, target_update_tau)

        target_q_values = U.function(critic_inputs, target_q)

        return train, update_target_q, {
            'q_values': q_values, 'target_q_values': target_q_values}
def q_train(make_obs_ph_n, act_space_n, q_index, q_func, shared_CNN, optimizer, make_obs_map_ph_n,
            grad_norm_clipping=None, local_q_func=False, scope="trainer", reuse=None,
            num_units=64):
    """Build critic and value-function graphs that also consume CNN map features.

    For every agent, the output of ``shared_CNN`` on that agent's map
    placeholder is appended to both the critic input and the value-function
    input. The critic is trained with mean squared error against the fed
    ``target`` placeholder, over the ``q_func`` and ``CNN`` scope variables.

    Returns:
        ``(train, update_target_q, debug)`` where ``debug`` holds
        ``'q_values'`` and ``'target_vf_values'``.
    """
    # NOTE(review): ``reuse`` is accepted but the variable scope below is
    # opened with reuse=None, so the argument has no effect here.
    pdtypes = [make_pdtype(space) for space in act_space_n]

    # Placeholders supplied by the caller, plus freshly created action/target ones.
    obs_ph_n = make_obs_ph_n
    obs_map_ph_n = make_obs_map_ph_n
    obs_ph_next = obs_ph_n
    act_ph_n = [
        pdtype.sample_placeholder([None], name="action" + str(idx))
        for idx, pdtype in enumerate(pdtypes)
    ]
    target_ph = tf.placeholder(tf.float32, [None], name="target")

    vf_input = tf.concat(obs_ph_next, 1)
    q_input = tf.concat(obs_ph_n + act_ph_n, 1)

    # Append each agent's CNN features; shared_CNN is invoked once per input.
    for agent_idx in range(len(obs_ph_n)):
        cnn_scope = "agent_" + str(agent_idx) + "/CNN"
        q_features = shared_CNN(obs_map_ph_n[agent_idx], agent_idx, scope=cnn_scope)
        q_input = tf.concat([q_input, q_features], 1)
        vf_features = shared_CNN(obs_map_ph_n[agent_idx], agent_idx, scope=cnn_scope)
        vf_input = tf.concat([vf_input, vf_features], 1)

    with tf.variable_scope(scope, reuse=None):
        if local_q_func:
            # NOTE(review): this replaces q_input, dropping the CNN features above.
            q_input = tf.concat([obs_ph_n[q_index], act_ph_n[q_index]], 1)

        q = q_func(q_input, 1, scope="q_func", num_units=num_units)[:, 0]
        vf = q_func(vf_input, 1, scope="vf_func", num_units=num_units)[:, 0]

        q_func_vars = U.scope_vars(U.absolute_scope_name("q_func"))
        vf_func_vars = U.scope_vars(U.absolute_scope_name("vf_func"))
        # NOTE(review): the CNNs are built outside this scope under
        # "agent_<i>/CNN"; confirm this lookup actually finds their variables.
        CNN_vars = U.scope_vars(U.absolute_scope_name("CNN"))

        q_loss = tf.reduce_mean(tf.square(q - target_ph))

        # Regulariser kept in the graph but currently left out of the loss.
        q_reg = tf.reduce_mean(tf.square(q))
        loss = q_loss

        optimize_expr = U.minimize_and_clip(
            optimizer, loss, q_func_vars + CNN_vars, grad_norm_clipping)

        # Callable wrappers.
        train = U.function(inputs=obs_ph_n + obs_map_ph_n + act_ph_n + [target_ph],
                           outputs=loss, updates=[optimize_expr])
        q_values = U.function(obs_ph_n + obs_map_ph_n + act_ph_n, q)

        # Target critic.
        target_q = q_func(q_input, 1, scope="target_q_func", num_units=num_units)[:, 0]
        target_q_func_vars = U.scope_vars(U.absolute_scope_name("target_q_func"))
        update_target_q = make_update_exp(q_func_vars, target_q_func_vars)

        target_q_values = U.function(obs_ph_n + obs_map_ph_n + act_ph_n, target_q)
        target_vf_values = U.function(obs_ph_n + obs_map_ph_n, vf)

        return train, update_target_q, {
            'q_values': q_values, 'target_vf_values': target_vf_values}
def q_LSTM_train(make_obs_ph_n, act_space_n, q_index, q_func, optimizer,
                 grad_norm_clipping=None, local_q_func=False, scope="trainer", reuse=None,
                 num_units=64):
    """Build the critic-training graph for a critic that also returns a state.

    ``q_func`` is expected to return a ``(q, state_out)`` pair. The critic
    input concatenates all observations, all actions and the LSTM state
    placeholders along the last axis; with ``local_q_func`` only this agent's
    observation and action are used.

    Returns:
        ``(train, update_target_q, debug)`` where ``debug`` holds
        ``'q_values'`` (outputs ``[q, q_state_out]``) and ``'target_q_values'``.
    """
    with tf.variable_scope(scope, reuse=reuse):
        # Action-distribution type for every agent.
        pdtypes = [make_pdtype(space) for space in act_space_n]

        # Placeholders; actions and targets carry an extra axis of size 1.
        obs_ph_n = make_obs_ph_n
        act_ph_n = [
            pdtype.sample_placeholder([None, 1], name="action" + str(idx))
            for idx, pdtype in enumerate(pdtypes)
        ]
        target_ph = tf.placeholder(tf.float32, [None, 1], name="target")

        q_res = 1
        q_c_ph, q_h_ph = get_lstm_state_ph(name='q_', n_batches=None, num_units=num_units)
        # The same pair of state placeholders is repeated once per agent.
        n_agents = len(obs_ph_n)
        q_c_ph_n = [q_c_ph] * n_agents
        q_h_ph_n = [q_h_ph] * n_agents

        # Original author's note: need to check this -- need safety checks.
        critic_inputs = obs_ph_n + act_ph_n + q_c_ph_n + q_h_ph_n
        q_input = tf.concat(critic_inputs, -1)
        if local_q_func:
            q_input = tf.concat([obs_ph_n[q_index], act_ph_n[q_index]], -1)

        q, q_state_out = q_func(q_input, 1, scope="q_func", num_units=num_units)
        q = q[:, 0]
        q_func_vars = U.scope_vars(U.absolute_scope_name("q_func"))

        q_loss = tf.reduce_mean(tf.square(q - target_ph))

        # Regulariser kept in the graph but currently left out of the loss.
        q_reg = tf.reduce_mean(tf.square(q))
        loss = q_loss

        optimize_expr = U.minimize_and_clip(optimizer, loss, q_func_vars, grad_norm_clipping)

        # Callable wrappers.
        q_values = U.function(inputs=critic_inputs, outputs=[q, q_state_out])
        train = U.function(inputs=critic_inputs + [target_ph], outputs=loss,
                           updates=[optimize_expr])

        # Target critic.
        target_q, target_q_state_out = q_func(
            q_input, 1, scope="target_q_func", num_units=num_units)
        target_q = target_q[:, 0]
        target_q_func_vars = U.scope_vars(U.absolute_scope_name("target_q_func"))
        update_target_q = make_update_exp(q_func_vars, target_q_func_vars)

        target_q_values = U.function(inputs=critic_inputs, outputs=target_q)

        return train, update_target_q, {
            'q_values': q_values, 'target_q_values': target_q_values}
def make_update_exp(vals, target_vals, rate=1e-2):
    """Build a callable that moves target variables toward source variables.

    Both collections are sorted by variable ``name`` and paired positionally.
    Each target is assigned ``polyak * target + (1 - polyak) * source`` with
    ``polyak = 1 - rate``.

    Args:
        vals: source variables (objects with a ``name`` attribute).
        target_vals: target variables (need ``name`` and ``assign``).
        rate: interpolation rate. The default ``1e-2`` reproduces the previous
            hard-coded behaviour; ``rate=1.0`` copies the source values.

    Returns:
        A ``U.function`` taking no inputs that runs all assignments.
    """
    polyak = 1.0 - rate
    sources = sorted(vals, key=lambda v: v.name)
    targets = sorted(target_vals, key=lambda v: v.name)
    assignments = [
        var_target.assign(polyak * var_target + (1.0 - polyak) * var)
        for var, var_target in zip(sources, targets)
    ]
    expression = tf.group(*assignments)
    return U.function([], [], updates=[expression])
def p_train(env, make_obs_ph_n, act_space_n, p_index, vf_func, shana, q_func, optimizer, grad_norm_clipping=None, local_q_func=False, num_units=64, scope="trainer", reuse=None):
    """Build a policy-training graph around a policy object created by ``shana``.

    NOTE(review): as written this block reads several names it never assigns
    and that are not parameters (``act_input_n``, ``act_sample``, ``p``,
    ``p_func``, ``p_input``). Unless they exist at module level, calling this
    function raises ``NameError``. ``vf_func`` is accepted but not used.

    Returns:
        ``(act, train, update_target_p, debug)`` where ``debug`` holds
        ``'p_values'`` and ``'target_act'``.
    """
    with tf.variable_scope(scope, reuse=reuse):
        # create distributions
        act_pdtype_n = [make_pdtype(act_space) for act_space in act_space_n]

        # set up placeholders
        obs_ph_n = make_obs_ph_n
        act_ph_n = [act_pdtype_n[i].sample_placeholder([None], name="action"+str(i)) for i in range(len(act_space_n))]

        # Policy object built by the injected ``shana`` factory with fixed settings.
        policy = shana(
            env_spec=env,
            af = 15,
            of = 22,
            K=2,
            hidden_layer_sizes=(100, 100),
            qf=q_func,
            reg=0.001
        )
        actions, log_pi = policy.actions_for(observations=make_obs_ph_n[p_index], with_log_pis=True)
        # Debug output of the graph tensors.
        print(actions)
        print(log_pi)
        p_func_vars = policy.get_params_internal()

        # NOTE(review): ``act_input_n`` is not defined in this block; presumably
        # it should be a copy of ``act_ph_n`` with ``actions`` substituted at
        # ``p_index`` -- TODO confirm.
        q_input = tf.concat(obs_ph_n + act_input_n, 1)
        if local_q_func:
            q_input = tf.concat([obs_ph_n[p_index], act_input_n[p_index]], 1)
        q = q_func(q_input, 1, scope="q_func", reuse=True, num_units=num_units)[:,0]
        pg_loss = -tf.reduce_mean(q)

        # NOTE(review): tf.get_collection is documented to return a list; if so,
        # ``p_reg * 1e-3`` below is a list-times-float and will fail -- TODO confirm.
        p_reg = tf.get_collection(
            tf.GraphKeys.REGULARIZATION_LOSSES,
            scope=policy.name)
        loss = pg_loss + p_reg * 1e-3

        optimize_expr = U.minimize_and_clip(optimizer, loss, p_func_vars, grad_norm_clipping)

        # Create callable functions
        train = U.function(inputs=obs_ph_n + act_ph_n, outputs=loss, updates=[optimize_expr])
        # NOTE(review): ``act_sample`` and ``p`` are not defined in this block.
        act = U.function(inputs=[obs_ph_n[p_index]], outputs=act_sample)
        p_values = U.function([obs_ph_n[p_index]], p)

        # target network
        # NOTE(review): ``p_func`` and ``p_input`` are not defined in this block.
        target_p = p_func(p_input, int(act_pdtype_n[p_index].param_shape()[0]), scope="target_p_func", num_units=num_units)
        target_p_func_vars = U.scope_vars(U.absolute_scope_name("target_p_func"))
        update_target_p = make_update_exp(p_func_vars, target_p_func_vars)

        target_act_sample = act_pdtype_n[p_index].pdfromflat(target_p).sample()
        target_act = U.function(inputs=[obs_ph_n[p_index]], outputs=target_act_sample)

        return act, train, update_target_p, {'p_values': p_values, 'target_act': target_act}
def make_update_exp(vals, target_vals, rate=1e-2):
    """Soft-update the variables of a target network from the current network.

    Args:
        vals: variables of the current network.
        target_vals: variables of the target network; paired with ``vals``
            after sorting both by ``name``.
        rate: interpolation rate toward the current network. The default
            ``1e-2`` keeps the previous fixed behaviour (polyak of 0.99);
            ``rate=1.0`` copies the current values into the target.

    Returns:
        A ``U.function`` with no inputs that updates the ``target_vals``
        variables when called.
    """
    polyak = 1.0 - rate  # 0.99 with the default rate
    expression = []
    pairs = zip(sorted(vals, key=lambda v: v.name),
                sorted(target_vals, key=lambda v: v.name))
    for var, var_target in pairs:
        # Blend the target-network variable with the current-network variable.
        blended = polyak * var_target + (1.0 - polyak) * var
        expression.append(var_target.assign(blended))
    # Grouping lets a single run of the op apply every assignment.
    expression = tf.group(*expression)
    return U.function([], [], updates=[expression])
def q_train(make_obs_ph_n, act_space_n, q_index, q_func, optimizer,
            grad_norm_clipping=None, local_q_func=False, scope="trainer", reuse=None,
            num_units=64):
    """Build the critic-training graph, including a hard-sync op for the target.

    The loss is the mean squared error against the fed ``target`` placeholder
    plus ``1e-3`` times the mean squared critic output.

    Returns:
        ``(train, update_target_q, sync_target_q, debug)`` where ``debug``
        holds ``'q_values'`` and ``'target_q_values'``.
    """
    with tf.variable_scope(scope, reuse=reuse):
        # Action-distribution type for every agent.
        pdtypes = [make_pdtype(space) for space in act_space_n]

        # Observation placeholders come from the caller; the rest is created here.
        obs_ph_n = make_obs_ph_n
        act_ph_n = [
            pdtype.sample_placeholder([None], name="action" + str(idx))
            for idx, pdtype in enumerate(pdtypes)
        ]
        target_ph = tf.placeholder(tf.float32, [None], name="target")

        critic_inputs = obs_ph_n + act_ph_n
        q_input = tf.concat(critic_inputs, 1)
        if local_q_func:
            q_input = tf.concat([obs_ph_n[q_index], act_ph_n[q_index]], 1)

        q = q_func(q_input, 1, scope="q_func", num_units=num_units)[:, 0]
        q_func_vars = U.scope_vars(U.absolute_scope_name("q_func"))

        td_error = q - target_ph
        q_loss = tf.reduce_mean(tf.square(td_error))

        # Original author's note (TEMP): the regulariser stands in for an
        # initial condition, giving a viscosity solution to the Bellman
        # differential equation.
        q_reg = tf.reduce_mean(tf.square(q))
        loss = q_loss + 1e-3 * q_reg

        optimize_expr = U.minimize_and_clip(optimizer, loss, q_func_vars, grad_norm_clipping)

        # Callable wrappers.
        train = U.function(inputs=critic_inputs + [target_ph], outputs=loss,
                           updates=[optimize_expr])
        q_values = U.function(critic_inputs, q)

        # Target critic with a soft update and a full copy.
        target_q = q_func(q_input, 1, scope="target_q_func", num_units=num_units)[:, 0]
        target_q_func_vars = U.scope_vars(U.absolute_scope_name("target_q_func"))
        update_target_q = make_update_exp(q_func_vars, target_q_func_vars)
        sync_target_q = make_update_exp(q_func_vars, target_q_func_vars, rate=1.0)

        target_q_values = U.function(critic_inputs, target_q)

        debug = {'q_values': q_values, 'target_q_values': target_q_values}
        return train, update_target_q, sync_target_q, debug
def q_train(make_obs_ph_n, act_space_n, q_index, q_func, optimizer,
            grad_norm_clipping=None, local_q_func=False, scope="trainer", reuse=None,
            num_units=64):
    """Build the critic-training graph for agent ``q_index``.

    The critic sees every agent's observation and action, or only its own
    pair when ``local_q_func`` is set. The loss is the mean squared error
    against the fed ``target`` placeholder.

    Returns:
        ``(train, update_target_q, debug)`` where ``debug`` holds
        ``'q_values'`` and ``'target_q_values'``.
    """
    with tf.variable_scope(scope, reuse=reuse):
        # Action-distribution type for every agent.
        pdtypes = [make_pdtype(space) for space in act_space_n]

        # Placeholders.
        obs_ph_n = make_obs_ph_n
        act_ph_n = [
            pdtype.sample_placeholder([None], name="action" + str(idx))
            for idx, pdtype in enumerate(pdtypes)
        ]
        target_ph = tf.placeholder(tf.float32, [None], name="target")

        joint_inputs = obs_ph_n + act_ph_n
        q_input = tf.concat(joint_inputs, 1)
        if local_q_func:
            # DDPG-style critic: it only has its own local information.
            q_input = tf.concat([obs_ph_n[q_index], act_ph_n[q_index]], 1)

        q = q_func(q_input, 1, scope="q_func", num_units=num_units)[:, 0]
        q_func_vars = U.scope_vars(U.absolute_scope_name("q_func"))

        q_loss = tf.reduce_mean(tf.square(q - target_ph))

        # Regulariser kept in the graph but currently left out of the loss.
        # Original author's open question: related to chapter 4.2, inferring
        # the policies of other agents?
        q_reg = tf.reduce_mean(tf.square(q))
        loss = q_loss

        # [Important, per the original note] use the optimizer to reduce the
        # loss over the variables in q_func_vars, with each variable's gradient
        # clipped according to grad_norm_clipping.
        optimize_expr = U.minimize_and_clip(optimizer, loss, q_func_vars, grad_norm_clipping)

        # Callable wrappers.
        train = U.function(inputs=joint_inputs + [target_ph], outputs=loss,
                           updates=[optimize_expr])
        q_values = U.function(joint_inputs, q)

        # Target critic.
        target_q = q_func(q_input, 1, scope="target_q_func", num_units=num_units)[:, 0]
        target_q_func_vars = U.scope_vars(U.absolute_scope_name("target_q_func"))
        update_target_q = make_update_exp(q_func_vars, target_q_func_vars)

        target_q_values = U.function(joint_inputs, target_q)

        return train, update_target_q, {
            'q_values': q_values, 'target_q_values': target_q_values}
def q_train(make_obs_ph_n, act_space_n, q_index, q_func, optimizer,
            grad_norm_clipping=None, local_q_func=False, scope="trainer", reuse=None,
            num_units=64):
    """Build a critic-training graph that uses a single action placeholder.

    Only ``act_space_n[0]`` is used to create the action placeholder; the
    critic input is all observation placeholders plus that one action.

    Returns:
        ``(train, update_target_q, debug)`` where ``debug`` holds
        ``'q_values'`` and ``'target_q_values'``.
    """
    with tf.variable_scope(scope, reuse=reuse):
        # Distribution type of the first action space only.
        act_pdtype = make_pdtype(act_space_n[0])

        # Placeholders: one action placeholder wrapped in a list.
        obs_ph_n = make_obs_ph_n
        single_action = act_pdtype.sample_placeholder([None], name="action" + str(0))
        act_ph = [single_action]
        target_ph = tf.placeholder(tf.float32, [None], name="target")

        critic_inputs = obs_ph_n + act_ph
        q_input = tf.concat(critic_inputs, 1)
        if local_q_func:
            q_input = tf.concat([obs_ph_n[q_index], act_ph[0]], 1)

        q = q_func(q_input, 1, scope="q_func", num_units=num_units)[:, 0]
        q_func_vars = U.scope_vars(U.absolute_scope_name("q_func"))

        q_loss = tf.reduce_mean(tf.square(q - target_ph))

        # Regulariser kept in the graph but currently left out of the loss.
        q_reg = tf.reduce_mean(tf.square(q))
        loss = q_loss

        optimize_expr = U.minimize_and_clip(optimizer, loss, q_func_vars, grad_norm_clipping)

        # Callable wrappers.
        train = U.function(inputs=critic_inputs + [target_ph], outputs=loss,
                           updates=[optimize_expr])
        q_values = U.function(critic_inputs, q)

        # Target critic.
        target_q = q_func(q_input, 1, scope="target_q_func", num_units=num_units)[:, 0]
        target_q_func_vars = U.scope_vars(U.absolute_scope_name("target_q_func"))
        update_target_q = make_update_exp(q_func_vars, target_q_func_vars)

        target_q_values = U.function(critic_inputs, target_q)

        return train, update_target_q, {
            'q_values': q_values, 'target_q_values': target_q_values}
def __init__(self, name, model, obs_shape_n, act_space_n, agent_index, args):
    """Store the agent settings and register a placeholder-backed dummy function.

    ``model`` and ``act_space_n`` are accepted but not used here.
    """
    self.name = name
    self.n = len(obs_shape_n)
    self.agent_index = agent_index
    self.args = args

    # Create dummy TensorFlow variables to avoid a Saver error.
    # TODO: remove this or turn into act function
    obs_ph_n = [
        U.BatchInput(obs_shape_n[idx], name="observation" + str(idx)).get()
        for idx in range(self.n)
    ]
    with tf.variable_scope(self.name, reuse=None):
        self.dummy_var = U.function(obs_ph_n, outputs=tf.Variable(0))
def make_update_exp(vals, target_vals, central=True, rate=1e-2):
    """Build a callable that moves target variables toward source variables.

    Args:
        vals: source variables. With ``central=True`` a flat collection of
            variables; otherwise an indexable collection of per-agent
            variable collections.
        target_vals: target variables, structured like ``vals``.
        central: selects between the flat and the per-agent layout.
        rate: interpolation rate. The default ``1e-2`` reproduces the previous
            hard-coded behaviour; ``rate=1.0`` copies the source values.

    Returns:
        A ``U.function`` taking no inputs that runs all assignments.
    """
    polyak = 1.0 - rate

    # Normalise both layouts to a list of (sources, targets) groups so the
    # pairing logic is written once.
    if central:
        groups = [(vals, target_vals)]
    else:
        groups = [(vals[a], target_vals[a]) for a in range(len(vals))]

    expression = []
    for sources, targets in groups:
        # Pair variables by sorted name within each group.
        for var, var_target in zip(sorted(sources, key=lambda v: v.name),
                                   sorted(targets, key=lambda v: v.name)):
            expression.append(
                var_target.assign(polyak * var_target + (1.0 - polyak) * var))
    expression = tf.group(*expression)
    return U.function([], [], updates=[expression])
def p_train(make_obs_ph_n, act_space_n, p_index, p_func, num_units=64, scope="trainer",
            reuse=None):
    """Build only the acting part of the policy graph for agent ``p_index``.

    Returns:
        A callable mapping this agent's observation to a sampled action.
    """
    with tf.variable_scope(scope, reuse=reuse):
        # Action-distribution type for every agent.
        pdtypes = [make_pdtype(space) for space in act_space_n]

        # Placeholders; the action placeholders are created but not consumed
        # by anything in this function.
        obs_ph_n = make_obs_ph_n
        act_ph_n = [
            pdtype.sample_placeholder([None], name="action" + str(idx))
            for idx, pdtype in enumerate(pdtypes)
        ]

        own_obs = obs_ph_n[p_index]
        own_pdtype = pdtypes[p_index]
        p = p_func(own_obs, int(own_pdtype.param_shape()[0]),
                   scope="p_func", num_units=num_units)

        # Wrap the network output in a distribution and sample from it.
        act_sample = own_pdtype.pdfromflat(p).sample()

        # Callable wrapper.
        return U.function(inputs=[own_obs], outputs=act_sample)
def make_update_exp(vals, target_vals, rate=1e-2):
    """
    Build the target-network update using Polyak averaging (an exponentially
    decaying average).

    Args:
        vals (list of tf.Variable): Network variables
        target_vals (list of tf.Variable): Target network variables; paired
            with ``vals`` after both are sorted by ``name``
        rate (float): Interpolation rate toward the network variables. The
            default ``1e-2`` reproduces the previous hard-coded behaviour;
            ``rate=1.0`` copies the network values into the target

    Returns:
        A ``tf_util.function`` with no inputs that applies the update when
        called
    """
    # Polyak coefficient for Polyak-averaging of the target network
    polyak = 1.0 - rate
    ordered_vals = sorted(vals, key=lambda v: v.name)
    ordered_targets = sorted(target_vals, key=lambda v: v.name)
    # Exponentially decaying average, one assignment per variable pair
    updates = [
        var_target.assign(polyak * var_target + (1.0 - polyak) * var)
        for var, var_target in zip(ordered_vals, ordered_targets)
    ]
    expression = tf.group(*updates)
    return tf_util.function([], [], updates=[expression])
def q_train(make_obs_ph_n, act_space_n, make_obs_history_n, make_act_history_n, q_index,
            q_func, optimizer, grad_norm_clipping=None, local_q_func=False, scope="trainer",
            reuse=None, num_units=64):
    """
    Build the critic (Q-learning) graph with observation and action history inputs.

    Args:
        make_obs_ph_n (list): Observation placeholders of all agents
        act_space_n (list): Action spaces of all agents
        make_obs_history_n (list): Observation-history placeholders of all agents
        make_act_history_n (list): Action-history placeholders of all agents
        q_index (int): Agent index number
        q_func (function): Network builder for the critic
        optimizer: Network optimizer
        grad_norm_clipping (float): Value used to clip the gradient norm
        local_q_func (boolean): Use only this agent's own observation and action
        scope (str): Name of the variable scope
        reuse (boolean): Whether to reuse the scope
        num_units (int): Width argument forwarded to ``q_func``

    Returns:
        train (function): Training function for the Q network
        update_target_q (function): Update function for the target Q network
        q_debug (dict): Contains 'q_values' and 'target_q_values' callables
    """
    with tf.variable_scope(scope, reuse=reuse):
        # Action-distribution type for every agent
        pdtypes = [make_pdtype(space) for space in act_space_n]

        # Placeholders
        obs_ph_n = make_obs_ph_n
        obs_history_n = make_obs_history_n
        act_history_n = make_act_history_n
        act_ph_n = [
            pdtype.sample_placeholder([None], name="action" + str(idx))
            for idx, pdtype in enumerate(pdtypes)
        ]
        target_ph = tf.placeholder(tf.float32, [None], name="target")

        # Current step plus history (the original implementation used only
        # obs_ph_n + act_ph_n)
        history_inputs = obs_ph_n + obs_history_n + act_ph_n + act_history_n
        q_input = tf.concat(history_inputs, 1)
        if local_q_func:
            # 'ddpg' case: only observations about this agent itself.
            # Original author's note: self position is relative to prey.
            q_input = tf.concat([obs_ph_n[q_index], act_ph_n[q_index]], 1)

        q = q_func(q_input, 1, scope="q_func", num_units=num_units)[:, 0]
        q_func_vars = tf_util.scope_vars(tf_util.absolute_scope_name("q_func"))

        q_loss = tf.reduce_mean(tf.square(q - target_ph))

        # Regulariser kept in the graph but currently left out of the loss
        q_reg = tf.reduce_mean(tf.square(q))
        loss = q_loss

        optimize_expr = tf_util.minimize_and_clip(
            optimizer, loss, q_func_vars, grad_norm_clipping)

        # Callable wrappers
        train = tf_util.function(inputs=history_inputs + [target_ph], outputs=loss,
                                 updates=[optimize_expr])
        q_values = tf_util.function(history_inputs, q)

        # Target network
        target_q = q_func(q_input, 1, scope="target_q_func", num_units=num_units)[:, 0]
        target_q_func_vars = tf_util.scope_vars(
            tf_util.absolute_scope_name("target_q_func"))
        update_target_q = make_update_exp(q_func_vars, target_q_func_vars)

        target_q_values = tf_util.function(history_inputs, target_q)

        q_debug = {'q_values': q_values, 'target_q_values': target_q_values}
        return train, update_target_q, q_debug
def p_train(make_obs_ph_n, act_space_n, p_index, p_func, q_func, optimizer,
            grad_norm_clipping=None, local_q_func=False, num_units=64, scope="trainer",
            reuse=None):
    """Build the policy-training graph plus a probe for critic action-gradients.

    Besides the usual policy loss (negative mean critic value with this
    agent's action sampled from its policy, plus ``1e-3`` times the mean
    squared distribution parameters), the existing ``q_func`` scope is
    evaluated on flat observation/action placeholders, and the gradient of
    that value with respect to the flat actions, divided by its norm, is
    exposed.

    Returns:
        ``(act, train, update_target_p, debug)`` where ``debug`` holds
        ``'p_values'``, ``'target_act'`` and ``'grad_norm_value'``.
    """
    with tf.variable_scope(scope, reuse=reuse):
        # Action-distribution type for every agent.
        pdtypes = [make_pdtype(space) for space in act_space_n]

        # Placeholders.
        obs_ph_n = make_obs_ph_n
        act_ph_n = [
            pdtype.sample_placeholder([None], name="action" + str(idx))
            for idx, pdtype in enumerate(pdtypes)
        ]

        own_obs = obs_ph_n[p_index]
        own_pdtype = pdtypes[p_index]

        p = p_func(own_obs, int(own_pdtype.param_shape()[0]),
                   scope="p_func", num_units=num_units)
        p_func_vars = U.scope_vars(U.absolute_scope_name("p_func"))

        # Distribution over this agent's actions.
        act_pd = own_pdtype.pdfromflat(p)
        act_sample = act_pd.sample()
        p_reg = tf.reduce_mean(tf.square(act_pd.flatparam()))

        # Copy the fed actions and swap in a fresh policy sample for this agent.
        act_input_n = list(act_ph_n)
        act_input_n[p_index] = act_pd.sample()

        q_input = tf.concat(obs_ph_n + act_input_n, 1)
        if local_q_func:
            q_input = tf.concat([own_obs, act_input_n[p_index]], 1)
        q = q_func(q_input, 1, scope="q_func", reuse=True, num_units=num_units)[:, 0]
        pg_loss = -tf.reduce_mean(q)

        # --- Gradient probe on flat inputs ---------------------------------
        # Flat widths are the agent count times the last dimension of the
        # first observation placeholder / first action space.
        obs_flat_dim = len(obs_ph_n) * int(obs_ph_n[0].shape[-1])
        act_flat_dim = len(act_space_n) * int(act_space_n[0].shape[-1])
        obs_flat_ph = tf.placeholder(tf.float32, shape=[None, obs_flat_dim],
                                     name="obs_flat_input")
        act_flat_ph = tf.placeholder(tf.float32, shape=[None, act_flat_dim],
                                     name="act_flat_input")
        q_vec_input = tf.concat([obs_flat_ph, act_flat_ph], axis=-1)
        serial_q = q_func(q_vec_input, 1, scope="q_func", reuse=True,
                          num_units=num_units)[:, 0]

        # Gradient of the flat-input critic value w.r.t. the flat actions.
        raw_grad = tf.gradients(serial_q, act_flat_ph)
        grad_norm = tf.divide(raw_grad, tf.norm(raw_grad))
        grad_norm_value = U.function([obs_flat_ph, act_flat_ph], grad_norm)
        # --------------------------------------------------------------------

        loss = pg_loss + p_reg * 1e-3

        optimize_expr = U.minimize_and_clip(optimizer, loss, p_func_vars, grad_norm_clipping)

        # Callable wrappers.
        train = U.function(inputs=obs_ph_n + act_ph_n, outputs=loss, updates=[optimize_expr])
        act = U.function(inputs=[own_obs], outputs=act_sample)
        p_values = U.function([own_obs], p)

        # Target policy network.
        target_p = p_func(own_obs, int(own_pdtype.param_shape()[0]),
                          scope="target_p_func", num_units=num_units)
        target_p_func_vars = U.scope_vars(U.absolute_scope_name("target_p_func"))
        update_target_p = make_update_exp(p_func_vars, target_p_func_vars)

        target_act_sample = own_pdtype.pdfromflat(target_p).sample()
        target_act = U.function(inputs=[own_obs], outputs=target_act_sample)

        debug = {
            'p_values': p_values,
            'target_act': target_act,
            'grad_norm_value': grad_norm_value,
        }
        return act, train, update_target_p, debug
def q_train(make_obs_ph_n, act_space_n, q_index, q_func, optimizer,
            grad_norm_clipping=None, local_q_func=False, scope="trainer", reuse=None,
            num_units=64):
    """Build the critic-training graph with a flat-input target critic.

    The online critic is fed the per-agent placeholders. The target critic is
    fed separate flat observation/action placeholders, and the gradient of the
    target value with respect to the flat actions, divided by its norm, is
    exposed.

    Returns:
        ``(train, update_target_q, debug)`` where ``debug`` holds
        ``'q_values'``, ``'target_q_values'``, ``'act_serial_values'``,
        ``'obs_serial_values'`` and ``'grad_norm_value'``.
    """
    with tf.variable_scope(scope, reuse=reuse):
        # Action-distribution type for every agent.
        pdtypes = [make_pdtype(space) for space in act_space_n]

        # Placeholders.
        obs_ph_n = make_obs_ph_n
        act_ph_n = [
            pdtype.sample_placeholder([None], name="action" + str(idx))
            for idx, pdtype in enumerate(pdtypes)
        ]
        target_ph = tf.placeholder(tf.float32, [None], name="target")

        # Serialised (flattened) views of the per-agent actions.
        act_shape = tf.shape(act_ph_n)
        act_serial = tf.concat(act_ph_n, 1)
        act_serial = tf.reshape(
            act_serial, [act_shape[1], act_shape[0] * act_shape[-1]])
        act_serial_values = U.function(act_ph_n, act_serial)

        # Same for the per-agent observations.
        obs_shape = tf.shape(obs_ph_n)
        obs_serial = tf.concat(obs_ph_n, 1)
        obs_serial = tf.reshape(
            obs_serial, [obs_shape[1], obs_shape[0] * obs_shape[-1]])
        obs_serial_values = U.function(obs_ph_n, obs_serial)

        # Flat placeholders that feed the target critic.
        obs_flat_dim = len(obs_ph_n) * int(obs_ph_n[0].shape[-1])
        act_flat_dim = len(act_space_n) * int(act_space_n[0].shape[-1])
        obs_flat_ph = tf.placeholder(tf.float32, shape=[None, obs_flat_dim],
                                     name="obs_flat_input")
        act_flat_ph = tf.placeholder(tf.float32, shape=[None, act_flat_dim],
                                     name="act_flat_input")
        target_input = tf.concat([obs_flat_ph, act_flat_ph], axis=-1)

        joint_inputs = obs_ph_n + act_ph_n
        q_input = tf.concat(joint_inputs, 1)
        if local_q_func:
            q_input = tf.concat([obs_ph_n[q_index], act_ph_n[q_index]], 1)

        q = q_func(q_input, 1, scope="q_func", num_units=num_units)[:, 0]
        q_func_vars = U.scope_vars(U.absolute_scope_name("q_func"))

        q_loss = tf.reduce_mean(tf.square(q - target_ph))

        # Regulariser kept in the graph but currently left out of the loss.
        q_reg = tf.reduce_mean(tf.square(q))
        loss = q_loss

        optimize_expr = U.minimize_and_clip(optimizer, loss, q_func_vars, grad_norm_clipping)

        # Callable wrappers.
        train = U.function(inputs=joint_inputs + [target_ph], outputs=loss,
                           updates=[optimize_expr])
        q_values = U.function(joint_inputs, q)

        # Target critic, evaluated on the flat placeholders.
        target_q = q_func(target_input, 1, scope="target_q_func",
                          num_units=num_units)[:, 0]
        target_q_func_vars = U.scope_vars(U.absolute_scope_name("target_q_func"))
        update_target_q = make_update_exp(q_func_vars, target_q_func_vars)

        flat_inputs = [obs_flat_ph, act_flat_ph]
        target_q_values = U.function(flat_inputs, target_q)

        # Gradient of the target value w.r.t. the flat actions.
        raw_grad = tf.gradients(target_q, act_flat_ph)
        grad_norm = tf.divide(raw_grad, tf.norm(raw_grad))
        grad_norm_value = U.function(flat_inputs, grad_norm)

        debug = {
            'q_values': q_values,
            'target_q_values': target_q_values,
            'act_serial_values': act_serial_values,
            'obs_serial_values': obs_serial_values,
            'grad_norm_value': grad_norm_value,
        }
        return train, update_target_q, debug
def p_train_recurrent(make_obs_ph_n, make_state_ph_n, make_obs_next_n,
                      make_obs_pred_n, act_space_n, p_index, p_policy,
                      p_predict, q_func, optimizer, grad_norm_clipping=None,
                      local_q_func=False, num_units=64, scope="trainer",
                      reuse=None):
    """Build the training graph for a recurrent policy with an obs predictor.

    The policy network (``p_policy``) maps the agent's observation, recurrent
    state and predicted observation to action parameters, a GRU output and a
    new state. A second network (``p_predict``) predicts the next observation
    from the action placeholder and the GRU output. Both are trained together
    on ``-mean(Q) + 1e-3 * p_reg + 1e-3 * pred_loss``.

    Args:
        make_obs_ph_n: Per-agent observation placeholders.
        make_state_ph_n: Per-agent recurrent-state placeholders.
        make_obs_next_n: Per-agent next-observation placeholders.
        make_obs_pred_n: Per-agent predicted-observation placeholders.
        act_space_n: Per-agent action spaces.
        p_index: Index of the agent being trained.
        p_policy: Builder of the recurrent policy network.
        p_predict: Builder of the observation-prediction network.
        q_func: Builder of the critic; its variables are reused
            (``reuse=True``) under scope ``"q_func"``.
        optimizer: Optimizer forwarded to ``U.minimize_and_clip``.
        grad_norm_clipping: Clipping value forwarded to
            ``U.minimize_and_clip``.
        local_q_func: When true the critic only sees this agent's own
            observation and action.
        num_units: Forwarded to the network builders; also the width of the
            ``gru_out`` placeholder.
        scope: Name of the enclosing variable scope.
        reuse: ``reuse`` flag of the enclosing variable scope.

    Returns:
        Tuple ``(step, predict, train, update_target_p, debug)`` where
        ``debug`` holds ``'p_values'``, ``'target_step'`` and
        ``'target_predict'``.
    """
    with tf.variable_scope(scope, reuse=reuse):
        # Placeholders supplied by the caller.
        obs_ph_n = make_obs_ph_n
        obs_next_n = make_obs_next_n
        state_ph_n = make_state_ph_n
        obs_pred_n = make_obs_pred_n

        # Action distributions and action placeholders for every agent.
        pd_types = [make_pdtype(space) for space in act_space_n]
        act_ph_n = [
            pd_type.sample_placeholder([None], name="action" + str(idx))
            for idx, pd_type in enumerate(pd_types)
        ]

        # Local inputs of the agent being trained.
        obs_input = obs_ph_n[p_index]
        state_input = state_ph_n[p_index]
        act_input = act_ph_n[p_index]
        obs_next = obs_next_n[p_index]
        obs_pred_input = obs_pred_n[p_index]

        # Policy network: action parameters, GRU output and next state.
        n_act_params = int(pd_types[p_index].param_shape()[0])
        p, gru_out, state = p_policy(
            obs_input, state_input, obs_pred_input, n_act_params,
            scope="p_policy", num_units=num_units)
        act_pd = pd_types[p_index].pdfromflat(p)
        act_sample = act_pd.sample()

        # Next-observation prediction from the taken action and GRU output.
        obs_width = int(obs_input.shape[1])
        obs_pred = p_predict(
            act_input, gru_out, obs_width,
            scope="p_predict", num_units=num_units)

        # Policy and predictor variables are optimised together.
        policy_vars = U.scope_vars(U.absolute_scope_name("p_policy"))
        predict_vars = U.scope_vars(U.absolute_scope_name("p_predict"))
        p_func_vars = policy_vars + predict_vars

        pred_loss = tf.reduce_mean(tf.square(obs_next - obs_pred))
        p_reg = tf.reduce_mean(tf.square(act_pd.flatparam()))

        # Feed the critic with this agent's freshly sampled action and the
        # other agents' actions from the placeholders.
        act_input_n = list(act_ph_n)
        act_input_n[p_index] = act_pd.sample()
        q_input = tf.concat(obs_ph_n + act_input_n, 1)
        if local_q_func:
            q_input = tf.concat([obs_ph_n[p_index], act_input_n[p_index]], 1)
        q = q_func(
            q_input, 1, scope="q_func", reuse=True,
            num_units=num_units)[:, 0]

        # Maximise Q, with small regularisation and prediction terms.
        pg_loss = -tf.reduce_mean(q)
        loss = pg_loss + p_reg * 1e-3 + pred_loss * 1e-3

        optimize_expr = U.minimize_and_clip(
            optimizer, loss, p_func_vars, grad_norm_clipping)

        # Callable wrappers.
        train = U.function(
            inputs=obs_ph_n + state_ph_n + act_ph_n + obs_next_n + obs_pred_n,
            outputs=loss,
            updates=[optimize_expr])
        local_inputs = [obs_input, state_input, obs_pred_input]
        step = U.function(
            inputs=list(local_inputs),
            outputs=[act_sample, state, gru_out])
        p_values = U.function(inputs=list(local_inputs), outputs=p)

        # Target networks mirror the online policy and predictor.
        target_p, target_gru_out, target_state = p_policy(
            obs_input, state_input, obs_pred_input, n_act_params,
            scope="target_p_policy", num_units=num_units)
        target_obs_pred = p_predict(
            act_input, target_gru_out, obs_width,
            scope="target_p_predict", num_units=num_units)
        target_policy_vars = U.scope_vars(
            U.absolute_scope_name("target_p_policy"))
        target_predict_vars = U.scope_vars(
            U.absolute_scope_name("target_p_predict"))
        target_p_func_vars = target_policy_vars + target_predict_vars

        # update the parameters θ'i = τθi + (1 − τ)θ'i
        update_target_p = make_update_exp(p_func_vars, target_p_func_vars)

        target_act_sample = pd_types[p_index].pdfromflat(target_p).sample()
        target_step = U.function(
            inputs=list(local_inputs),
            outputs=[target_act_sample, target_state, target_gru_out])

        # Stand-alone predictors driven by an externally supplied GRU output.
        gru_temp = tf.placeholder(
            tf.float32, [None, num_units], name='gru_out')
        pred_temp = p_predict(
            act_input, gru_temp, obs_width,
            scope="p_predict", num_units=num_units)
        predict = U.function(inputs=[act_input, gru_temp], outputs=pred_temp)

        target_pred_temp = p_predict(
            act_input, gru_temp, obs_width,
            scope="target_p_predict", num_units=num_units)
        target_predict = U.function(
            inputs=[act_input, gru_temp], outputs=target_pred_temp)

        debug = {
            'p_values': p_values,
            'target_step': target_step,
            'target_predict': target_predict
        }
        return step, predict, train, update_target_p, debug
def q_train(make_obs_ph_n, act_space_n, q_index, q_func, optimizer,
            grad_norm_clipping=None, scope="coma_trainer", reuse=None,
            num_units=64, num_outputs=1):
    """Build the COMA critic training graph.

    The critic is created under scope ``"coma_q_func"`` and its target copy
    under ``"coma_target_q_func"``. The variable lists used for optimisation
    and for the target update are collected from those same scopes.

    Args:
        make_obs_ph_n: Per-agent observation placeholders.
        act_space_n: Per-agent action spaces.
        q_index: Agent index (not read by this function).
        q_func: Network builder, called as
            ``q_func(input, num_outputs, scope=..., num_units=...)``.
        optimizer: Optimizer forwarded to ``U.minimize_and_clip``.
        grad_norm_clipping: Clipping value forwarded to
            ``U.minimize_and_clip``.
        scope: Name of the enclosing variable scope.
        reuse: ``reuse`` flag of the enclosing variable scope.
        num_units: Forwarded to ``q_func``.
        num_outputs: Number of critic outputs.

    Returns:
        Tuple ``(train, update_target_q, debug)`` where ``debug`` holds
        ``'q_values'`` and ``'target_q_values'``.
    """
    with tf.variable_scope(scope, reuse=reuse):
        # create distributions
        act_pdtype_n = [make_pdtype(act_space) for act_space in act_space_n]

        # set up placeholders
        obs_ph_n = make_obs_ph_n
        act_ph_n = [
            tf.placeholder(tf.int32, [None], name="action" + str(i))
            for i in range(len(act_space_n))
        ]
        # NOTE(review): target_ph is a Python list of placeholders; it is
        # subtracted from q and passed as a single nested element of
        # `inputs` below -- confirm this is what U.function and the callers
        # expect.
        target_ph = [
            tf.placeholder(tf.float32, [None], name="target")
            for _ in range(len(act_space_n))
        ]

        # Concatenate along axis 1.
        # NOTE(review): the action placeholders are int32 with shape [None]
        # while they are concatenated with the observations on axis 1 --
        # confirm dtype/rank compatibility.
        q_input = tf.concat(obs_ph_n + act_ph_n, 1)
        q = q_func(q_input,
                   num_outputs,
                   scope="coma_q_func",
                   num_units=num_units)
        # Collect the variables from the scope the critic was actually built
        # in ("coma_q_func"); looking them up under "q_func" does not match
        # that scope name.
        q_func_vars = U.scope_vars(U.absolute_scope_name("coma_q_func"))

        q_loss = tf.reduce_mean(tf.square(q - target_ph))

        # viscosity solution to Bellman differential equation in place of an
        # initial condition (currently not added to the loss)
        q_reg = tf.reduce_mean(tf.square(q))
        loss = q_loss

        optimize_expr = U.minimize_and_clip(optimizer, loss, q_func_vars,
                                            grad_norm_clipping)

        # Create callable functions
        train = U.function(inputs=obs_ph_n + act_ph_n + [target_ph],
                           outputs=loss,
                           updates=[optimize_expr])
        q_values = U.function(obs_ph_n + act_ph_n, q)

        # target network
        target_q = q_func(q_input,
                          num_outputs,
                          scope="coma_target_q_func",
                          num_units=num_units)
        # Same fix as above: use the scope the target critic was built in.
        target_q_func_vars = U.scope_vars(
            U.absolute_scope_name("coma_target_q_func"))
        update_target_q = make_update_exp(q_func_vars, target_q_func_vars)

        target_q_values = U.function(obs_ph_n + act_ph_n, target_q)

        return train, update_target_q, {
            'q_values': q_values,
            'target_q_values': target_q_values
        }
def dqn_train(make_obs_ph_n, act_space_n, p_index, p_func, q_func, optimizer,
              grad_norm_clipping=None, local_q_func="dqn", num_units=64,
              scope="trainer", reuse=None):
    """Build a DQN-style training graph for one agent.

    The network ``p_func`` maps the agent's own observation to one output per
    action parameter. The loss is the mean squared difference between the sum
    of those outputs (over axis 1) and the ``target`` placeholder.

    Args:
        make_obs_ph_n: Per-agent observation placeholders.
        act_space_n: Per-agent action spaces.
        p_index: Index of the agent being trained.
        p_func: Network builder, called as
            ``p_func(input, n_out, scope=..., num_units=...)``.
        q_func: Not read by this function.
        optimizer: Optimizer forwarded to ``U.minimize_and_clip``.
        grad_norm_clipping: Clipping value forwarded to
            ``U.minimize_and_clip``.
        local_q_func: Not read by this function.
        num_units: Forwarded to ``p_func``.
        scope: Name of the enclosing variable scope.
        reuse: ``reuse`` flag of the enclosing variable scope.

    Returns:
        Tuple ``(act, train, update_target_p, debug)`` where ``debug`` holds
        ``'p_values'`` and ``'target_act'``.
    """
    with tf.variable_scope(scope, reuse=reuse):
        # Action distribution types, one per agent.
        pd_types = [make_pdtype(space) for space in act_space_n]

        # Placeholders.
        obs_ph_n = make_obs_ph_n
        act_ph_n = [
            pd_type.sample_placeholder([None], name="action" + str(idx))
            for idx, pd_type in enumerate(pd_types)
        ]

        # The network only sees this agent's own observation.
        own_obs = obs_ph_n[p_index]
        own_pd_type = pd_types[p_index]
        n_out = int(own_pd_type.param_shape()[0])
        p = p_func(own_obs, n_out, scope="p_func", num_units=num_units)
        p_func_vars = U.scope_vars(U.absolute_scope_name("p_func"))

        # Wrap the outputs in a distribution to sample actions from.
        act_pd = own_pd_type.pdfromflat(p)
        act_sample = act_pd.sample()

        # Regress the summed outputs onto the supplied target.
        target_ph = tf.placeholder(tf.float32, [None], name="target")
        tf_p = tf.reduce_sum(p, reduction_indices=1)
        td_error = tf_p - target_ph
        loss = tf.reduce_mean(tf.square(td_error))

        optimize_expr = U.minimize_and_clip(
            optimizer, loss, p_func_vars, grad_norm_clipping)

        # Callable wrappers.
        train = U.function(
            inputs=obs_ph_n + act_ph_n + [target_ph],
            outputs=loss,
            updates=[optimize_expr])
        act = U.function(inputs=[own_obs], outputs=act_sample)
        p_values = U.function([own_obs], p)

        # Target network.
        target_p = p_func(
            own_obs, n_out, scope="target_p_func", num_units=num_units)
        target_p_func_vars = U.scope_vars(
            U.absolute_scope_name("target_p_func"))
        update_target_p = make_update_exp(p_func_vars, target_p_func_vars)

        target_act_sample = own_pd_type.pdfromflat(target_p).sample()
        target_act = U.function(inputs=[own_obs], outputs=target_act_sample)

        debug = {'p_values': p_values, 'target_act': target_act}
        return act, train, update_target_p, debug
def p_train(env, make_obs_ph_n, act_space_n, p_index, vf_func, shana, q_func, shared_CNN, optimizer, make_obs_map_ph_n,grad_norm_clipping=None, local_q_func=False, num_units=64, scope="trainer", reuse=None):
    """Build the policy / value-function training graph with a shared CNN.

    A policy object is created from ``shana`` and fed the agent's observation
    concatenated with a CNN encoding of its map observation. The critic and
    value-function inputs are the joint observations (plus actions for the
    critic) extended with one CNN encoding per agent.

    NOTE(review): this function was reconstructed from whitespace-collapsed
    source. The placement of the three inner ``with tf.variable_scope(...)``
    blocks (nested vs. sibling of the first one) is a best guess -- confirm
    against the original file, since it decides the variable-scope names.

    Args:
        env: Passed to ``shana`` as ``env_spec``.
        make_obs_ph_n: Per-agent observation placeholders.
        act_space_n: Per-agent action spaces.
        p_index: Index of the agent being trained.
        vf_func: Not read by this function (the value function is built with
            ``q_func``).
        shana: Policy class/factory; must provide ``actions_for``,
            ``get_params_internal`` and ``name``.
        q_func: Network builder used for both the critic and the value
            function.
        shared_CNN: Callable ``shared_CNN(map_ph, index, scope=...)``.
        optimizer: Optimizer forwarded to ``U.minimize_and_clip``.
        make_obs_map_ph_n: Per-agent map-observation placeholders.
        grad_norm_clipping: Clipping value forwarded to
            ``U.minimize_and_clip``.
        local_q_func: When true the critic input is replaced by this agent's
            own observation and action.
        num_units: Forwarded to ``q_func``.
        scope: Name of the variable scope(s) opened here.
        reuse: ``reuse`` flag of the first variable scope.

    Returns:
        Tuple ``(act, train, misaka, update_target_p, upvf, debug)`` where
        ``misaka`` is the value-function training function, ``upvf`` the
        target value-function update and ``debug`` holds ``'target_act'``.
    """
    with tf.variable_scope(scope, reuse=reuse):
        # create distribtuions
        act_pdtype_n = [make_pdtype(act_space) for act_space in act_space_n]

        # set up placeholders
        obs_ph_n = make_obs_ph_n
        act_ph_n = [act_pdtype_n[i].sample_placeholder([None], name="action"+str(i)) for i in range(len(act_space_n))]
        obs_map_ph_n=make_obs_map_ph_n
        p_map_input=obs_map_ph_n[p_index]

        # Policy object; the numeric arguments are hard-coded here.
        policy = shana(
            env_spec=env,
            af = 15,
            of = 25+12,
            K=2,
            hidden_layer_sizes=(128, 128),
            qf=q_func,
            reg=0.001
        )
        # CNN encoding of this agent's map, appended to its observation.
        map_context_input=shared_CNN(p_map_input,p_index,scope="CNN")
        CNN_vars=U.scope_vars(U.absolute_scope_name("CNN"))
        act, log_pi = policy.actions_for(observations=tf.concat([make_obs_ph_n[p_index],map_context_input],1), with_log_pis=True)
        # Copy the action placeholders and substitute the policy's action.
        act_input_n = act_ph_n + []
        act_input_n[p_index]=act
        p_func_vars = policy.get_params_internal()
        q_input = tf.concat(obs_ph_n + act_input_n, 1)
        vf_input = tf.concat(obs_ph_n, 1)

        # Append one CNN encoding per agent to both inputs. shared_CNN is
        # called twice per agent with the same scope string.
        for i in range(len(obs_ph_n)):
            q_input=tf.concat([q_input,shared_CNN(obs_map_ph_n[i],i,scope="agent_"+str(i)+"/CNN")],1)
            vf_input=tf.concat([vf_input,shared_CNN(obs_map_ph_n[i],i,scope="agent_"+str(i)+"/CNN")],1)
    with tf.variable_scope(scope, reuse=None):
        if local_q_func:
            q_input = tf.concat([obs_ph_n[p_index], act_input_n[p_index]], 1)
        # Both networks are requested with reuse=True, i.e. their variables
        # must already exist.
        q = q_func(q_input, 1, scope="q_func", reuse=True, num_units=num_units)[:,0]
        vf = q_func(vf_input, 1, scope="vf_func",reuse=True, num_units=num_units)[:,0]
        vf_func_vars = U.scope_vars(U.absolute_scope_name("vf_func"))
        pg_loss = tf.reduce_mean(log_pi * tf.stop_gradient(log_pi - q + vf))
        # NOTE(review): tf.get_collection returns a Python list; it is added
        # to pg_loss without a reduction -- confirm whether a
        # tf.reduce_sum(...) was intended.
        p_reg = tf.get_collection(
            tf.GraphKeys.REGULARIZATION_LOSSES,
            scope=policy.name)
        loss = pg_loss + p_reg

        vf_loss = 0.5 * tf.reduce_mean((vf - tf.stop_gradient(q - log_pi))**2)

        optimize_expr = U.minimize_and_clip(optimizer, loss, p_func_vars, grad_norm_clipping)
        mikoto = U.minimize_and_clip(optimizer, vf_loss, vf_func_vars, grad_norm_clipping)
    with tf.variable_scope(scope, reuse=True):
        # The CNN variables receive updates from both losses.
        CNN_p_optimizer=U.minimize_and_clip(optimizer, loss, CNN_vars, grad_norm_clipping)
        CNN_v_optimizaer=U.minimize_and_clip(optimizer, vf_loss, CNN_vars, grad_norm_clipping)
    with tf.variable_scope(scope, reuse=None):
        # Create callable functions
        train = U.function(inputs=obs_ph_n + act_ph_n+obs_map_ph_n, outputs=loss, updates=[optimize_expr,CNN_p_optimizer])
        # NOTE(review): the value-function trainer reports `loss`, not
        # `vf_loss` -- confirm this is intended.
        misaka = U.function(inputs=obs_ph_n + act_ph_n+obs_map_ph_n, outputs=loss, updates=[mikoto,CNN_v_optimizaer])

        # target network
        target_p = shana(
            env_spec=env,
            af = 15,
            of = 25+12,
            K=2,
            hidden_layer_sizes=(128, 128),
            qf=q_func,
            reg=0.001,
            name = 'target_policy'
        )
        target_p_func_vars = target_p.get_params_internal()
        target_vf = q_func(vf_input, 1, scope="target_vf_func", num_units=num_units)[:,0]
        target_vf_func_vars = U.scope_vars(U.absolute_scope_name("target_vf_func"))
        target_act_r, tar_log = target_p.actions_for(observations=tf.concat([obs_ph_n[p_index],map_context_input],1),with_log_pis=True)
        update_target_p = make_update_exp(p_func_vars, target_p_func_vars)
        upvf = make_update_exp(vf_func_vars, target_vf_func_vars)

        target_act = U.function(inputs=[obs_ph_n[p_index], obs_map_ph_n[p_index]], outputs=target_act_r)
        # `act` is rebound here from the action tensor to the callable.
        act=U.function(inputs=[obs_ph_n[p_index], obs_map_ph_n[p_index]], outputs=act)
        return act, train, misaka, update_target_p, upvf, {'target_act': target_act}
def p_train(make_obs_ph_n, act_space_n, p_index, p_func, q_func, optimizer,
            num_outputs, grad_norm_clipping=None, local_q_func=False,
            num_units=64, scope="coma_trainer", reuse=None):
    """Build the COMA actor training graph for one agent.

    The actor is created under scope ``"coma_p_func"`` and its target copy
    under ``"coma_target_p_func"``. The variable lists used for optimisation
    and for the target update are collected from those same scopes.

    Args:
        make_obs_ph_n: Per-agent observation placeholders.
        act_space_n: Per-agent action spaces.
        p_index: Index of the agent being trained.
        p_func: Actor network builder.
        q_func: Critic network builder; reused (``reuse=True``) under scope
            ``"coma_q_func"``.
        optimizer: Optimizer forwarded to ``U.minimize_and_clip``.
        num_outputs: Number of critic outputs.
        grad_norm_clipping: Clipping value forwarded to
            ``U.minimize_and_clip``.
        local_q_func: When true the critic only sees this agent's own
            observation and action.
        num_units: Forwarded to the network builders.
        scope: Name of the enclosing variable scope.
        reuse: ``reuse`` flag of the enclosing variable scope.

    Returns:
        Tuple ``(act, train, update_target_p, debug)`` where ``debug`` holds
        ``'p_values'`` and ``'target_act'``.
    """
    with tf.variable_scope(scope, reuse=reuse):
        # create distributions
        act_pdtype_n = [make_pdtype(act_space) for act_space in act_space_n]

        # set up placeholders
        obs_ph_n = make_obs_ph_n
        act_ph_n = [
            tf.placeholder(tf.int32, [None], name="action" + str(i))
            for i in range(len(act_space_n))
        ]

        # The actor's input is the agent's local observation.
        p_input = obs_ph_n[p_index]

        p = p_func(p_input,
                   int(act_pdtype_n[p_index].param_shape()[0]),
                   scope="coma_p_func",
                   num_units=num_units)
        # Collect the variables from the scope the actor was actually built
        # in ("coma_p_func"); looking them up under "p_func" does not match
        # that scope name.
        p_func_vars = U.scope_vars(U.absolute_scope_name("coma_p_func"))

        # wrap parameters in distribution
        act_pd = act_pdtype_n[p_index].pdfromflat(p)

        # Sampled action; COMA needs one specific action per sample, hence
        # the argmax below.
        act_sample = act_pd.sample()
        # NOTE(review): this iterates over `act_sample` in Python and calls
        # `.tolist()` on each row. If `act_sample` is a symbolic graph tensor
        # (as the placeholder-based inputs suggest) this cannot run at graph
        # construction time -- confirm, and consider tf.argmax.
        act_picked = [act.tolist().index(max(act)) for act in act_sample]

        p_reg = tf.reduce_mean(tf.square(act_pd.flatparam()))

        # `+ []` makes a shallow copy so the placeholder list is not mutated.
        act_input_n = act_ph_n + []
        # Replace the current agent's action with the picked one.
        act_input_n[p_index] = act_picked
        q_input = tf.concat(obs_ph_n + act_input_n, 1)
        if local_q_func:
            q_input = tf.concat([obs_ph_n[p_index], act_input_n[p_index]], 1)
        q = q_func(q_input,
                   num_outputs,
                   scope="coma_q_func",
                   reuse=True,
                   num_units=num_units)

        # Counterfactual baseline.
        # NOTE(review): same Python-level iteration concern as above for
        # `act_sample` and `q`.
        baseline = [
            baseline_calculation(act_distribute, q_list)
            for act_distribute, q_list in zip(act_sample, q)
        ]
        # Q-value of the action that was actually picked.
        actual_picked_q = [q_list[act] for act, q_list in zip(act_picked, q)]
        # Difference between the picked action's Q-value and the baseline.
        a = [q - b for q, b in zip(actual_picked_q, baseline)]
        pg_loss = -tf.reduce_mean(a)

        loss = pg_loss + p_reg * 1e-3

        optimize_expr = U.minimize_and_clip(optimizer, loss, p_func_vars,
                                            grad_norm_clipping)

        # Create callable functions
        train = U.function(inputs=obs_ph_n + act_ph_n,
                           outputs=loss,
                           updates=[optimize_expr])
        act = U.function(inputs=[obs_ph_n[p_index]], outputs=act_sample)
        p_values = U.function([obs_ph_n[p_index]], p)

        # target network
        target_p = p_func(p_input,
                          int(act_pdtype_n[p_index].param_shape()[0]),
                          scope="coma_target_p_func",
                          num_units=num_units)
        # Same fix as above: use the scope the target actor was built in.
        target_p_func_vars = U.scope_vars(
            U.absolute_scope_name("coma_target_p_func"))
        update_target_p = make_update_exp(p_func_vars, target_p_func_vars)

        target_act_sample = act_pdtype_n[p_index].pdfromflat(target_p).sample()
        target_act = U.function(inputs=[obs_ph_n[p_index]],
                                outputs=target_act_sample)

        return act, train, update_target_p, {
            'p_values': p_values,
            'target_act': target_act
        }
def c_next(make_obs_ph, act_space, c_ph, c_next_func, num_constraints,
           optimizer, grad_norm_clipping, num_units=64, reuse=False,
           scope="c_next"):
    """Build one next-step constraint model per constraint.

    For constraint ``k`` a network ``c_next_func`` maps the observation to a
    vector ``g_k``. The predicted next constraint value is
    ``reduce_sum(c_ph[k] + g_k * action, -1)``, regressed onto a target
    placeholder with a mean-squared error.

    Args:
        make_obs_ph: Observation placeholder(s).
        act_space: Sequence of action spaces; only ``act_space[0]`` is used.
        c_ph: Sequence of current-constraint placeholders, one per
            constraint.
        c_next_func: Network builder, called as
            ``c_next_func(input, n_out, scope=..., num_units=...)``.
        num_constraints: Number of constraints to model.
        optimizer: Optimizer forwarded to ``U.minimize_and_clip``.
        grad_norm_clipping: Clipping value forwarded to
            ``U.minimize_and_clip``.
        num_units: Forwarded to ``c_next_func``.
        reuse: ``reuse`` flag of the enclosing variable scope.
        scope: Name of the enclosing variable scope.

    Returns:
        Tuple ``(train, c_next_values, g_next_values)``; each element is a
        list with one ``U.function`` per constraint.
    """
    constraint_ids = range(num_constraints)
    with tf.variable_scope(scope, reuse=reuse):
        # set up placeholders
        act_pdtype = make_pdtype(act_space[0])
        obs_ph = make_obs_ph
        act_ph = act_pdtype.sample_placeholder([None], name="action")
        c_next_target_ph = [
            tf.placeholder(tf.float32, [None, 1], name="target" + str(k))
            for k in constraint_ids
        ]

        c_next_input = tf.concat(obs_ph, 1)

        # One network per constraint; its width is half of the distribution
        # parameter count.
        g_width = int((act_pdtype.param_shape()[0]) / 2)
        gs_ = [
            c_next_func(c_next_input,
                        g_width,
                        scope="c_next_func" + str(k),
                        num_units=num_units)
            for k in constraint_ids
        ]

        # to be testified
        # NOTE(review): c_ is reduced over the last axis while the target
        # placeholders are [None, 1]; confirm the subtraction below does not
        # broadcast to an unintended shape.
        c_ = [
            tf.reduce_sum(c_ph[k] + tf.multiply(gs_[k], act_ph), -1)
            for k in constraint_ids
        ]

        c_next_vars = [
            U.scope_vars(U.absolute_scope_name("c_next_func" + str(k)))
            for k in constraint_ids
        ]

        diff = [c_[k] - c_next_target_ph[k] for k in constraint_ids]
        c_next_loss = [tf.reduce_mean(tf.square(d)) for d in diff]

        optimize_expr = [
            U.minimize_and_clip(optimizer, c_next_loss[k], c_next_vars[k],
                                grad_norm_clipping)
            for k in constraint_ids
        ]

        # Create callable functions. One training function per constraint:
        # the comprehension previously lacked its `for` clause and therefore
        # only built a single function for the last loop index.
        train = [
            U.function(inputs=[obs_ph] + [act_ph] + [c_ph[k]] +
                       [c_next_target_ph[k]],
                       outputs=c_next_loss[k],
                       updates=[optimize_expr[k]])
            for k in constraint_ids
        ]
        c_next_values = [
            U.function([obs_ph] + [act_ph] + [c_ph[k]], c_[k])
            for k in constraint_ids
        ]
        g_next_values = [
            U.function([obs_ph], gs_[k]) for k in constraint_ids
        ]

        return train, c_next_values, g_next_values
def q_train(make_obs_ph_n, act_space_n, q_index, q_func, u_func, optimizer, optimizer_lamda, exp_var_alpha=None, cvar_alpha=None, cvar_beta=None, grad_norm_clipping=None, local_q_func=False, scope="trainer", reuse=None, num_units=64, u_estimation=False, constrained=True, constraint_type=None, agent_type=None):
    """Build a (optionally risk-constrained) critic training graph.

    Besides the critic ``q`` the graph can contain a second network ``u``
    (when ``u_estimation`` is true), a Lagrange-multiplier variable
    ``lamda_constraint`` (when ``constrained`` is true) and a variable
    ``v_constraint`` (when ``constraint_type == "CVAR"``). The multiplier is
    trained on the negated loss with ``optimizer_lamda``.

    NOTE(review): this function was reconstructed from whitespace-collapsed
    source, so the nesting of several ``if``/``else`` blocks is a best guess
    -- confirm against the original file. Several argument combinations look
    like they leave names undefined (see the inline notes).

    Args:
        make_obs_ph_n: List of observation placeholders.
        act_space_n: Per-agent action spaces; only ``act_space_n[q_index]``
            is used.
        q_index: Agent index; also used as suffix of the constraint variable
            names.
        q_func: Critic network builder.
        u_func: Builder of the ``u`` network (used when ``u_estimation``).
        optimizer: Optimizer for the critic (and ``u`` / ``v_constraint``).
        optimizer_lamda: Optimizer for ``lamda_constraint``.
        exp_var_alpha: Threshold subtracted from ``var`` in the ``"Exp_Var"``
            constraint.
        cvar_alpha: Threshold used in the ``"CVAR"`` constraint.
        cvar_beta: Level used in the ``"CVAR"`` expression.
        grad_norm_clipping: Clipping value forwarded to
            ``U.minimize_and_clip``.
        local_q_func: When true the critic input is ``obs_ph_n[0]`` and the
            single action placeholder.
        scope: Name of the enclosing variable scope.
        reuse: Not read; the scope is always opened with ``tf.AUTO_REUSE``.
        num_units: Forwarded to the network builders.
        u_estimation: Whether to build and train the ``u`` network.
        constrained: Whether to add the constraint term and multiplier.
        constraint_type: ``"Exp_Var"``, ``"CVAR"`` or anything else.
        agent_type: Not read by this function.

    Returns:
        Tuple ``(train, train2, train3, update_target_q, update_target_u,
        debug)``; entries that do not apply are ``None``.
    """
    with tf.variable_scope(scope, reuse=tf.AUTO_REUSE):
        if constrained:
            lamda_constraint = tf.get_variable(
                'lamda_constraint' + str(q_index), [1],
                initializer=tf.constant_initializer(1.0),
                dtype=tf.float32)
        if constraint_type == "CVAR":
            v_constraint = tf.get_variable(
                'v_constraint' + str(q_index), [1],
                initializer=tf.constant_initializer(1.0),
                dtype=tf.float32)

        # create distribtuions
        # Despite the `_n` suffix this is a single distribution type.
        act_pdtype_n = make_pdtype(act_space_n[q_index])

        # set up placeholders
        obs_ph_n = make_obs_ph_n
        act_ph_n = [act_pdtype_n.sample_placeholder([None], name="action0")]
        target_ph = tf.placeholder(tf.float32, [None], name="target")
        if u_estimation:
            target_ph_u = tf.placeholder(tf.float32, [None], name="target_u")
        rew = tf.placeholder(tf.float32, [None], name="reward")

        q_input = tf.concat(obs_ph_n + act_ph_n, 1)
        if local_q_func:
            q_input = tf.concat([obs_ph_n[0], act_ph_n[0]], 1)
        q = q_func(q_input, 1, scope="q_func", num_units=num_units)[:, 0]

        if u_estimation:
            # u is regressed onto rew^2 + 2*rew*target + target_u.
            u_input = tf.concat(obs_ph_n + act_ph_n, 1)
            u = u_func(u_input, 1, scope="u_func", num_units=num_units)[:, 0]
            u_loss = tf.reduce_mean(
                tf.square(
                    tf.square(rew) + 2 * tf.multiply(rew, target_ph) +
                    target_ph_u - u))
            var = u - tf.square(q)
        else:
            var = tf.square(rew + target_ph) - tf.square(q)

        # NOTE(review): with this nesting, constrained=True together with a
        # constraint_type other than "Exp_Var"/"CVAR" (including the default
        # None) leaves q_loss undefined -- confirm.
        if constrained:
            if constraint_type == "Exp_Var":
                #print ('In constraint generation with lamda alpha')
                constraint = lamda_constraint * (var - exp_var_alpha)
                q_loss = tf.reduce_mean(
                    tf.square(q - (target_ph + rew - constraint)))
            elif constraint_type == "CVAR":
                cvar = v_constraint + (1.0 / (1.0 - cvar_beta)) * tf.reduce_mean(
                    tf.nn.relu(q - v_constraint))
                constraint = lamda_constraint * (cvar_alpha - cvar)
                q_loss = tf.reduce_mean(
                    tf.square(q - (target_ph + rew - constraint)))
        else:
            q_loss = tf.reduce_mean(tf.square(q - (target_ph + rew)))
        q_func_vars = U.scope_vars(U.absolute_scope_name("q_func"))
        if u_estimation:
            u_func_vars = U.scope_vars(U.absolute_scope_name("u_func"))

        # viscosity solution to Bellman differential equation in place of an initial condition
        q_reg = tf.reduce_mean(tf.square(q))
        train3 = None

        if u_estimation:
            loss = q_loss + u_loss  #+ 1e-3 * q_reg
            optimize_expr = U.minimize_and_clip(optimizer, loss,
                                                q_func_vars + u_func_vars,
                                                grad_norm_clipping)
            train = U.function(inputs=obs_ph_n + act_ph_n + [target_ph] +
                               [target_ph_u] + [rew],
                               outputs=[q_loss, u_loss],
                               updates=[optimize_expr])
            var_fn = U.function(inputs=obs_ph_n + act_ph_n + [target_ph] +
                                [target_ph_u] + [rew],
                                outputs=var)
            # NOTE(review): cvar_fn is not created on this path, but the
            # return statement reads it when constraint_type == "CVAR".
        elif constraint_type == "CVAR":
            loss = q_loss
            # v_constraint is optimised together with the critic here.
            optimize_expr = U.minimize_and_clip(optimizer, loss,
                                                q_func_vars + [v_constraint],
                                                grad_norm_clipping)
            train = U.function(inputs=obs_ph_n + act_ph_n + [target_ph] +
                               [rew],
                               outputs=q_loss,
                               updates=[optimize_expr])
            # NOTE(review): `cvar` only exists when constrained is true.
            cvar_fn = U.function(inputs=obs_ph_n + act_ph_n + [target_ph] +
                                 [rew],
                                 outputs=cvar)
            var_fn = U.function(inputs=obs_ph_n + act_ph_n + [target_ph] +
                                [rew],
                                outputs=var)
        else:
            #print ('in loss minimization over q_func_vars')
            loss = q_loss
            optimize_expr = U.minimize_and_clip(optimizer, loss, q_func_vars,
                                                grad_norm_clipping)
            train = U.function(inputs=obs_ph_n + act_ph_n + [target_ph] +
                               [rew],
                               outputs=q_loss,
                               updates=[optimize_expr])
            var_fn = U.function(inputs=obs_ph_n + act_ph_n + [target_ph] +
                                [rew],
                                outputs=var)
            # NOTE(review): v_constraint is only created when
            # constraint_type == "CVAR", which this branch excludes -- this
            # looks like a NameError when constrained is false; confirm.
            if not constrained:
                optimize_expr3 = U.minimize_and_clip(optimizer, loss,
                                                     [v_constraint],
                                                     grad_norm_clipping)
                train3 = U.function(inputs=obs_ph_n + act_ph_n + [target_ph] +
                                    [rew],
                                    outputs=q_loss,
                                    updates=[optimize_expr3])
        #loss = loss + 1e-4*q_reg

        # Create callable functions
        q_values = U.function(obs_ph_n + act_ph_n, q)

        # target network
        target_q = q_func(q_input, 1, scope="target_q_func",
                          num_units=num_units)[:, 0]
        target_q_func_vars = U.scope_vars(
            U.absolute_scope_name("target_q_func"))
        update_target_q = make_update_exp(q_func_vars, target_q_func_vars)

        target_q_values = U.function(obs_ph_n + act_ph_n, target_q)

        if u_estimation:
            u_values = U.function(obs_ph_n + act_ph_n, u)
            target_u = u_func(u_input, 1, scope="target_u_func",
                              num_units=num_units)[:, 0]
            target_u_func_vars = U.scope_vars(
                U.absolute_scope_name("target_u_func"))
            update_target_u = make_update_exp(u_func_vars, target_u_func_vars)
            target_u_values = U.function(obs_ph_n + act_ph_n, target_u)

        if constrained:
            # The multiplier ascends the loss: it minimises the negated loss.
            loss2 = -loss
            #print ('in loss maximisation over lamda')
            optimize_expr2 = U.minimize_and_clip(optimizer_lamda, loss2,
                                                 [lamda_constraint],
                                                 grad_norm_clipping)
            if u_estimation:
                train2 = U.function(inputs=obs_ph_n + act_ph_n + [target_ph] +
                                    [target_ph_u] + [rew],
                                    outputs=loss2,
                                    updates=[optimize_expr2])
            else:
                train2 = U.function(inputs=obs_ph_n + act_ph_n + [target_ph] +
                                    [rew],
                                    outputs=loss2,
                                    updates=[optimize_expr2])

        # Fill in None for everything that was not built.
        if not u_estimation:
            update_target_u = None
            target_u_values = None
            u_values = None
        if not constrained:
            train2 = None
            lamda_constraint = None
        if constraint_type != "CVAR":
            cvar_fn = None
            v_constraint = None

        return train, train2, train3, update_target_q, update_target_u, {
            'q_values': q_values,
            'u_values': u_values,
            'target_q_values': target_q_values,
            'target_u_values': target_u_values,
            'var': var_fn,
            'cvar': cvar_fn,
            'lamda_constraint': lamda_constraint,
            'v_constraint': v_constraint,
            'optimize_expr': optimize_expr
        }
def p_train(make_obs_ph_n, act_space_n, p_index, p_func, q_func, optimizer,
            adversarial, adv_eps, adv_eps_s, num_adversaries,
            grad_norm_clipping=None, local_q_func=False, num_units=64,
            scope="trainer", reuse=None):
    """Build the actor training graph, optionally with adversarial actions.

    Without ``adversarial`` the actor maximises the critic's value of its own
    sampled action. With ``adversarial`` the other agents' actions are first
    shifted along the normalised gradient of the policy loss (scaled per
    agent), and the actor maximises the critic's value at those perturbed
    actions.

    Args:
        make_obs_ph_n: Per-agent observation placeholders.
        act_space_n: Per-agent action spaces.
        p_index: Index of the agent being trained.
        p_func: Actor network builder.
        q_func: Critic network builder; reused (``reuse=True``) under scope
            ``"q_func"``.
        optimizer: Optimizer forwarded to ``U.minimize_and_clip``.
        adversarial: Whether to train against perturbed actions.
        adv_eps: Perturbation scale applied to agents on the other side of
            the ``num_adversaries`` split from agent ``p_index``.
        adv_eps_s: Perturbation scale applied to agents on the same side.
        num_adversaries: Agents with index below this value form one side.
        grad_norm_clipping: Clipping value forwarded to
            ``U.minimize_and_clip``.
        local_q_func: When true the (unperturbed) critic only sees this
            agent's own observation and action.
        num_units: Forwarded to the network builders.
        scope: Name of the enclosing variable scope.
        reuse: ``reuse`` flag of the enclosing variable scope.

    Returns:
        Tuple ``(act, train, update_target_p, debug)`` where ``debug`` holds
        ``'p_values'`` and ``'target_act'``.
    """
    with tf.variable_scope(scope, reuse=reuse):
        # create distributions
        act_pdtype_n = [make_pdtype(act_space) for act_space in act_space_n]

        # set up placeholders
        obs_ph_n = make_obs_ph_n
        act_ph_n = [
            act_pdtype_n[i].sample_placeholder([None], name="action" + str(i))
            for i in range(len(act_space_n))
        ]

        p_input = obs_ph_n[p_index]

        p = p_func(p_input,
                   int(act_pdtype_n[p_index].param_shape()[0]),
                   scope="p_func",
                   num_units=num_units)
        p_func_vars = U.scope_vars(U.absolute_scope_name("p_func"))

        # wrap parameters in distribution
        act_pd = act_pdtype_n[p_index].pdfromflat(p)

        act_sample = act_pd.sample()
        p_reg = tf.reduce_mean(tf.square(act_pd.flatparam()))

        # Copy the placeholders and substitute this agent's sampled action.
        act_input_n = act_ph_n + []
        act_input_n[p_index] = act_pd.sample()
        q_input = tf.concat(obs_ph_n + act_input_n, 1)
        if local_q_func:
            q_input = tf.concat([obs_ph_n[p_index], act_input_n[p_index]], 1)
        q = q_func(q_input, 1, scope="q_func", reuse=True,
                   num_units=num_units)[:, 0]
        pg_loss = -tf.reduce_mean(q)

        if adversarial:
            num_agents = len(act_input_n)
            # Agents on the same side as p_index get adv_eps_s, the others
            # adv_eps.
            if p_index < num_adversaries:
                adv_rate = [
                    adv_eps_s * (i < num_adversaries) + adv_eps *
                    (i >= num_adversaries) for i in range(num_agents)
                ]
            else:
                adv_rate = [
                    adv_eps_s * (i >= num_adversaries) + adv_eps *
                    (i < num_adversaries) for i in range(num_agents)
                ]
            print("      adv rate for p_index : ", p_index, adv_rate)

            # Direction in action space that increases the policy loss,
            # treated as a constant during optimisation.
            raw_perturb = tf.gradients(pg_loss, act_input_n)
            perturb = [
                tf.stop_gradient(tf.nn.l2_normalize(elem, axis=1))
                for elem in raw_perturb
            ]
            perturb = [perturb[i] * adv_rate[i] for i in range(num_agents)]
            # This agent's own action is left unperturbed.
            new_act_n = [
                perturb[i] + act_input_n[i] if i != p_index else act_input_n[i]
                for i in range(len(act_input_n))
            ]

            adv_q_input = tf.concat(obs_ph_n + new_act_n, 1)
            adv_q = q_func(adv_q_input,
                           1,
                           scope="q_func",
                           reuse=True,
                           num_units=num_units)[:, 0]
            # Train against the critic evaluated at the perturbed actions.
            # (This previously rebuilt the loss from the unperturbed `q`,
            # which made the whole adversarial branch a no-op.)
            pg_loss = -tf.reduce_mean(adv_q)

        loss = pg_loss + p_reg * 1e-3

        optimize_expr = U.minimize_and_clip(optimizer, loss, p_func_vars,
                                            grad_norm_clipping)

        # Create callable functions
        train = U.function(inputs=obs_ph_n + act_ph_n,
                           outputs=loss,
                           updates=[optimize_expr])
        act = U.function(inputs=[obs_ph_n[p_index]], outputs=act_sample)
        p_values = U.function([obs_ph_n[p_index]], p)

        # target network
        target_p = p_func(p_input,
                          int(act_pdtype_n[p_index].param_shape()[0]),
                          scope="target_p_func",
                          num_units=num_units)
        target_p_func_vars = U.scope_vars(
            U.absolute_scope_name("target_p_func"))
        update_target_p = make_update_exp(p_func_vars, target_p_func_vars)

        target_act_sample = act_pdtype_n[p_index].pdfromflat(target_p).sample()
        target_act = U.function(inputs=[obs_ph_n[p_index]],
                                outputs=target_act_sample)

        return act, train, update_target_p, {
            'p_values': p_values,
            'target_act': target_act
        }
def q_train(make_obs_ph_n, act_space_n, q_index, q_func, optimizer,
            grad_norm_clipping=None, local_q_func=False, scope="trainer",
            reuse=None, num_units=64):
    """Build the critic (Q-network) training graph for one agent.

    The critic is regressed onto the ``target`` placeholder with a
    mean-squared error. A second network with the same input is created
    under ``"target_q_func"`` and linked to the online critic through
    ``make_update_exp``.

    Args:
        make_obs_ph_n: Per-agent observation placeholders.
        act_space_n: Per-agent action spaces.
        q_index: Agent index; only read when ``local_q_func`` is true.
        q_func: Network builder, called as
            ``q_func(input, 1, scope=..., num_units=...)``.
        optimizer: Optimizer forwarded to ``U.minimize_and_clip``.
        grad_norm_clipping: Clipping value forwarded to
            ``U.minimize_and_clip``.
        local_q_func: When true the critic only sees agent ``q_index``'s own
            observation and action.
        scope: Name of the enclosing variable scope.
        reuse: ``reuse`` flag of the enclosing variable scope.
        num_units: Forwarded to ``q_func``.

    Returns:
        Tuple ``(train, update_target_q, debug)`` where ``debug`` holds
        ``'q_values'`` and ``'target_q_values'``.
    """
    with tf.variable_scope(scope, reuse=reuse):
        # Distribution type for every agent's action space.
        pd_types = [make_pdtype(space) for space in act_space_n]

        # Three groups of placeholders: observations, actions, and the
        # regression target. The leading [None] dimension is the batch size.
        obs_ph_n = make_obs_ph_n
        act_ph_n = [
            pd_type.sample_placeholder([None], name="action" + str(idx))
            for idx, pd_type in enumerate(pd_types)
        ]
        target_ph = tf.placeholder(tf.float32, [None], name="target")

        # The critic sees every agent's observation and action, joined on
        # axis 1 ...
        q_input = tf.concat(obs_ph_n + act_ph_n, 1)
        if local_q_func:
            # ... unless only the agent's own data should be used.
            q_input = tf.concat([obs_ph_n[q_index], act_ph_n[q_index]], 1)

        # Keep column 0 of the network output for every row.
        q = q_func(q_input, 1, scope="q_func", num_units=num_units)[:, 0]
        q_func_vars = U.scope_vars(U.absolute_scope_name("q_func"))

        # Mean-squared error against whatever is fed into target_ph
        # (presumably the TD target computed by the caller).
        td_error = q - target_ph
        q_loss = tf.reduce_mean(tf.square(td_error))

        # viscosity solution to Bellman differential equation in place of an
        # initial condition (currently not added to the loss)
        q_reg = tf.reduce_mean(tf.square(q))
        loss = q_loss  # + 1e-3 * q_reg

        # Optimisation op, with optional gradient clipping.
        optimize_expr = U.minimize_and_clip(
            optimizer, loss, q_func_vars, grad_norm_clipping)

        # Callable wrappers.
        critic_inputs = obs_ph_n + act_ph_n
        train = U.function(
            inputs=critic_inputs + [target_ph],
            outputs=loss,
            updates=[optimize_expr])
        q_values = U.function(critic_inputs, q)

        # Target network.
        target_q = q_func(
            q_input, 1, scope="target_q_func", num_units=num_units)[:, 0]
        target_q_func_vars = U.scope_vars(
            U.absolute_scope_name("target_q_func"))
        update_target_q = make_update_exp(q_func_vars, target_q_func_vars)

        target_q_values = U.function(obs_ph_n + act_ph_n, target_q)

        # Everything returned is a callable that takes placeholder values.
        debug = {'q_values': q_values, 'target_q_values': target_q_values}
        return train, update_target_q, debug
def p_train(make_obs_ph_n, act_space_n, p_index, p_func, q_func, optimizer,
            grad_norm_clipping=None, local_q_func=False, num_units=64,
            scope="trainer", reuse=None):
    """Build the actor (policy) training graph for one agent.

    The actor maps the agent's own observation to action-distribution
    parameters. Its sampled action replaces the agent's entry in the joint
    action list fed to the (reused) critic, and the actor is trained on
    ``-mean(Q) + 1e-3 * p_reg``.

    Args:
        make_obs_ph_n: Per-agent observation placeholders.
        act_space_n: Per-agent action spaces.
        p_index: Index of the agent being trained.
        p_func: Actor network builder.
        q_func: Critic network builder; reused (``reuse=True``) under scope
            ``"q_func"``, so the critic must already exist in this scope.
        optimizer: Optimizer forwarded to ``U.minimize_and_clip``.
        grad_norm_clipping: Clipping value forwarded to
            ``U.minimize_and_clip``.
        local_q_func: When true the critic only sees this agent's own
            observation and action.
        num_units: Forwarded to the network builders.
        scope: Name of the enclosing variable scope.
        reuse: ``reuse`` flag of the enclosing variable scope.

    Returns:
        Tuple ``(act, train, update_target_p, debug)`` where ``debug`` holds
        ``'p_values'`` and ``'target_act'``.
    """
    with tf.variable_scope(scope, reuse=reuse):
        # One action-distribution type per agent.
        pd_types = [make_pdtype(space) for space in act_space_n]

        # Observation placeholders of all agents, plus one action
        # placeholder per agent (leading [None] is the batch dimension).
        obs_ph_n = make_obs_ph_n
        act_ph_n = [
            pd_type.sample_placeholder([None], name="action" + str(idx))
            for idx, pd_type in enumerate(pd_types)
        ]

        # The actor only observes its own surroundings.
        own_obs = obs_ph_n[p_index]
        own_pd_type = pd_types[p_index]
        n_act_params = int(own_pd_type.param_shape()[0])

        # Actor network: one output per action-distribution parameter.
        p = p_func(own_obs, n_act_params, scope="p_func", num_units=num_units)
        p_func_vars = U.scope_vars(U.absolute_scope_name("p_func"))

        # Wrap the raw outputs in a distribution and sample an action.
        act_pd = own_pd_type.pdfromflat(p)
        act_sample = act_pd.sample()

        # Penalise large raw actor outputs.
        p_reg = tf.reduce_mean(tf.square(act_pd.flatparam()))

        # Replace only this agent's action with one sampled from its own
        # policy; this is what connects the actor to the critic.
        act_input_n = list(act_ph_n)
        act_input_n[p_index] = act_pd.sample()

        # Critic input: all observations and all actions ...
        q_input = tf.concat(obs_ph_n + act_input_n, 1)
        if local_q_func:
            # ... or only this agent's own pair.
            q_input = tf.concat([obs_ph_n[p_index], act_input_n[p_index]], 1)

        # reuse=True: use the critic variables that already exist in this
        # scope instead of creating new ones.
        q = q_func(
            q_input, 1, scope="q_func", reuse=True,
            num_units=num_units)[:, 0]

        # Maximising mean Q is minimising its negation.
        pg_loss = -tf.reduce_mean(q)
        loss = pg_loss + p_reg * 1e-3

        # Gradient-descent op for the actor variables.
        optimize_expr = U.minimize_and_clip(
            optimizer, loss, p_func_vars, grad_norm_clipping)

        # Callable wrappers.
        train = U.function(
            inputs=obs_ph_n + act_ph_n,
            outputs=loss,
            updates=[optimize_expr])
        act = U.function(inputs=[own_obs], outputs=act_sample)
        p_values = U.function([own_obs], p)

        # Target network.
        target_p = p_func(
            own_obs, n_act_params, scope="target_p_func",
            num_units=num_units)
        target_p_func_vars = U.scope_vars(
            U.absolute_scope_name("target_p_func"))
        update_target_p = make_update_exp(p_func_vars, target_p_func_vars)

        target_act_sample = own_pd_type.pdfromflat(target_p).sample()
        target_act = U.function(inputs=[own_obs], outputs=target_act_sample)

        debug = {'p_values': p_values, 'target_act': target_act}
        return act, train, update_target_p, debug
def p_train(make_obs_ph_n, act_space_n, make_obs_history_n,
            make_act_history_n, p_index, p_func, q_func, optimizer,
            grad_norm_clipping=None, local_q_func=False, num_units=64,
            scope="trainer", reuse=None):
    """
    Policy learning guided by Q-value

    The policy sees the agent's current observation concatenated with its
    observation history. The critic sees current and historical observations
    and actions of all agents, with this agent's current action replaced by a
    sample from its policy so that the Q-value depends on the policy.

    Args:
        make_obs_ph_n (tf.placeholder): Placeholder for the observation space of all agents
        act_space_n (list): A list of the action spaces for all agents
        make_obs_history_n (tf.placeholder): Placeholder for the observation history of all agents
        make_act_history_n (tf.placeholder): Placeholder for the action space history of all agents
        p_index (int): Agent index number
        p_func (function): MLP Neural Network model for the agent.
        q_func (function): MLP Neural Network model for the agent.
        optimizer (function): Network Optimizer function
        grad_norm_clipping (float): Value by which to clip the norm of the gradient
        local_q_func (boolean): Flag for using local q function
        num_units (int): The number outputs for the layers of the model
        scope (str): The name of the scope
        reuse (boolean): Flag specifying whether to reuse the scope

    Returns:
        act (function): Action function for retrieving agent action.
        train (function): Training function for P network
        update_target_p (function): Update function for updating P network values
        p_debug (dict): Contains 'p_values' and 'target_act' of the P network
    """
    with tf.variable_scope(scope, reuse=reuse):
        # Create distributions
        act_pdtype_n = [make_pdtype(act_space) for act_space in act_space_n]

        # Set up placeholders
        obs_ph_n = make_obs_ph_n
        obs_history_n = make_obs_history_n
        act_history_n = make_act_history_n
        act_ph_n = [
            act_pdtype_n[i].sample_placeholder([None], name="action" + str(i))
            for i in range(len(act_space_n))
        ]
        # Extra scalar-per-sample inputs that blend a "ccm" term into the
        # policy loss (see pg_loss below).
        ccm_ph_n = [
            tf.placeholder(tf.float32, [None], name="ccm" + str(p_index))
        ]
        ccm_lambda = [
            tf.placeholder(tf.float32, [None], name="lambda" + str(p_index))
        ]
        ccm_switch = [
            tf.placeholder(tf.float32, [None], name="switch" + str(p_index))
        ]

        # Policy input: current observation plus observation history.
        p_input = tf.concat([obs_ph_n[p_index], obs_history_n[p_index]], 1)

        p = p_func(p_input,
                   int(act_pdtype_n[p_index].param_shape()[0]),
                   scope="p_func",
                   num_units=num_units)
        p_func_vars = tf_util.scope_vars(tf_util.absolute_scope_name("p_func"))

        # Wrap parameters in distribution
        act_pd = act_pdtype_n[p_index].pdfromflat(p)

        act_sample = act_pd.sample()
        p_reg = tf.reduce_mean(tf.square(act_pd.flatparam()))

        # Copy the action placeholders and substitute this agent's sampled
        # action.
        act_input_n = act_ph_n + []
        act_input_n[p_index] = act_pd.sample()

        # Current plus previous time-steps. The current actions come from
        # act_input_n (not act_ph_n) so that the critic's output is a
        # function of the policy; with the raw placeholders the Q term
        # contributed no gradient to the policy variables.
        q_input = tf.concat(
            obs_ph_n + obs_history_n + act_input_n + act_history_n, 1)

        if local_q_func:
            # Only have observations about myself when 'ddpg'
            q_input = tf.concat([obs_ph_n[p_index], act_input_n[p_index]], 1)

        q = q_func(q_input, 1, scope="q_func", reuse=True,
                   num_units=num_units)[:, 0]

        # Blend of the usual -mean(Q) objective and the externally supplied
        # ccm term, selected by ccm_switch and weighted by ccm_lambda.
        pg_loss = -(1 - ccm_lambda[0]) * tf.reduce_mean(q) * (1 - ccm_switch[0]) - \
            ccm_lambda[0] * ccm_ph_n[0] * ccm_switch[0]

        loss = pg_loss + p_reg * 1e-3

        optimize_expr = tf_util.minimize_and_clip(optimizer, loss,
                                                  p_func_vars,
                                                  grad_norm_clipping)

        # Create callable functions
        train = tf_util.function(inputs=obs_ph_n + obs_history_n + act_ph_n +
                                 act_history_n + ccm_ph_n + ccm_lambda +
                                 ccm_switch,
                                 outputs=loss,
                                 updates=[optimize_expr])

        # The policy functions take the observation and its history.
        act = tf_util.function(inputs=[obs_ph_n[p_index]] +
                               [obs_history_n[p_index]],
                               outputs=act_sample)
        p_values = tf_util.function(inputs=[obs_ph_n[p_index]] +
                                    [obs_history_n[p_index]],
                                    outputs=p)

        # target network
        target_p = p_func(p_input,
                          int(act_pdtype_n[p_index].param_shape()[0]),
                          scope="target_p_func",
                          num_units=num_units)
        target_p_func_vars = tf_util.scope_vars(
            tf_util.absolute_scope_name("target_p_func"))
        update_target_p = make_update_exp(p_func_vars, target_p_func_vars)

        target_act_sample = act_pdtype_n[p_index].pdfromflat(target_p).sample()
        target_act = tf_util.function(inputs=[obs_ph_n[p_index]] +
                                      [obs_history_n[p_index]],
                                      outputs=target_act_sample)

        return act, train, update_target_p, {
            'p_values': p_values,
            'target_act': target_act
        }
def q_train(
        make_obs_ph_n,
        act_space_n,
        q_index,
        q_func,
        optimizer,
        adversarial,
        adv_eps,
        adv_eps_s,
        num_adversaries,
        grad_norm_clipping=None,
        local_q_func=False,
        scope="trainer",
        reuse=None,
        num_units=64):
    """Build the Q-network training graph and its callable helpers.

    Two networks are created with ``q_func``: one under the scope
    ``"q_func"`` (trained against ``target_ph`` with a mean squared error)
    and one under ``"target_q_func"``.  When ``adversarial`` is truthy the
    target Q output is re-evaluated on actions that were shifted along the
    L2-normalised gradient of ``-mean(target_q)`` for every agent except
    ``q_index``.

    Args:
        make_obs_ph_n (list): Used directly as the list of observation
            placeholders (it is not called, despite the name).
        act_space_n (list): Action spaces, one per agent; each is passed to
            ``make_pdtype``.
        q_index (int): Index of the agent this critic belongs to.
        q_func (function): Network builder, called as
            ``q_func(input, 1, scope=..., num_units=...)``; column 0 of its
            output is used as the Q value.
        optimizer: Passed through to ``U.minimize_and_clip``.
        adversarial (bool): Enables the perturbed target-Q evaluation.
        adv_eps (float): Scale applied to the normalised action perturbation.
        adv_eps_s (float): Only used for the printed ``adv_rate`` list.
        num_adversaries (int): Only used for the printed ``adv_rate`` list.
        grad_norm_clipping: Passed through to ``U.minimize_and_clip``.
        local_q_func (bool): If truthy, the Q input is only this agent's
            observation and action instead of all agents' inputs.
        scope (str): Variable scope wrapping everything built here.
        reuse: ``reuse`` argument of the wrapping variable scope.
        num_units (int): Forwarded to ``q_func``.

    Returns:
        train (function): Runs one optimisation step and returns the loss.
        update_target_q (function): Result of
            ``make_update_exp(q_func_vars, target_q_func_vars)``.
        q_debug (dict): Contains 'q_values' and 'target_q_values' functions.
    """
    with tf.variable_scope(scope, reuse=reuse):
        # One action-distribution type per agent.
        pd_types = [make_pdtype(space) for space in act_space_n]

        # Placeholders: observations are supplied by the caller, actions and
        # the regression target are created here.
        obs_ph_n = make_obs_ph_n
        act_ph_n = []
        for agent_idx, pd_type in enumerate(pd_types):
            act_ph_n.append(
                pd_type.sample_placeholder([None],
                                           name="action" + str(agent_idx)))
        target_ph = tf.placeholder(tf.float32, [None], name="target")

        # Centralised input by default; overridden by the local variant.
        q_input = tf.concat(obs_ph_n + act_ph_n, 1)
        if local_q_func:
            q_input = tf.concat([obs_ph_n[q_index], act_ph_n[q_index]], 1)

        q_out = q_func(q_input, 1, scope="q_func", num_units=num_units)
        q = q_out[:, 0]
        q_func_vars = U.scope_vars(U.absolute_scope_name("q_func"))

        q_loss = tf.reduce_mean(tf.square(q - target_ph))

        # viscosity solution to Bellman differential equation in place of an
        # initial condition
        # (q_reg is built but currently not added to the loss.)
        q_reg = tf.reduce_mean(tf.square(q))
        loss = q_loss  # + 1e-3 * q_reg

        optimize_expr = U.minimize_and_clip(optimizer, loss, q_func_vars,
                                            grad_norm_clipping)

        # Callable wrappers around the online network.
        q_inputs = obs_ph_n + act_ph_n
        train = U.function(inputs=q_inputs + [target_ph],
                           outputs=loss,
                           updates=[optimize_expr])
        q_values = U.function(q_inputs, q)

        # Target network, fed with the same input tensor as the online one.
        target_q = q_func(q_input, 1, scope="target_q_func",
                          num_units=num_units)[:, 0]

        if adversarial:
            num_agents = len(act_ph_n)

            # NOTE(review): adv_rate is only printed below; the perturbation
            # itself is scaled by adv_eps for every agent. Confirm whether
            # adv_rate was meant to be applied.
            adv_rate = []
            for k in range(num_agents):
                if q_index < num_adversaries:
                    rate = (adv_eps_s * (k < num_adversaries)
                            + adv_eps * (k >= num_adversaries))
                else:
                    rate = (adv_eps_s * (k >= num_adversaries)
                            + adv_eps * (k < num_adversaries))
                adv_rate.append(rate)
            print(" adv rate for q_index : ", q_index, adv_rate)

            # Gradient of -mean(target_q) w.r.t. every action placeholder.
            pg_loss = -tf.reduce_mean(target_q)
            raw_perturb = tf.gradients(pg_loss, act_ph_n)

            # Normalise each gradient per row and block back-propagation
            # through the perturbation itself.
            perturb = []
            for grad in raw_perturb:
                perturb.append(
                    adv_eps * tf.stop_gradient(
                        tf.nn.l2_normalize(grad, axis=1)))

            # Every agent except q_index gets its action shifted.
            new_act_n = []
            for k in range(num_agents):
                if k != q_index:
                    new_act_n.append(perturb[k] + act_ph_n[k])
                else:
                    new_act_n.append(act_ph_n[k])

            # NOTE(review): this input always concatenates all agents'
            # observations and actions, even when local_q_func is set --
            # confirm that combination is intended/supported.
            adv_q_input = tf.concat(obs_ph_n + new_act_n, 1)
            target_q = q_func(adv_q_input, 1, scope='target_q_func',
                              reuse=True, num_units=num_units)[:, 0]

        target_q_func_vars = U.scope_vars(
            U.absolute_scope_name("target_q_func"))
        update_target_q = make_update_exp(q_func_vars, target_q_func_vars)

        target_q_values = U.function(obs_ph_n + act_ph_n, target_q)

        q_debug = {
            'q_values': q_values,
            'target_q_values': target_q_values,
        }
        return train, update_target_q, q_debug