def q_train(make_obs_ph_n, act_space_n, q_index, q_func, optimizer,
            grad_norm_clipping=None, local_q_func=False, scope="trainer",
            reuse=None, num_units=64):
    """Build the critic (Q-function) training graph for agent ``q_index``.

    Args:
        make_obs_ph_n: list of observation placeholders, one per agent.
        act_space_n: list of per-agent action spaces (gym-style).
        q_index: index of the agent whose critic is being built.
        q_func: callable ``(input, num_outputs, scope, num_units)`` returning
            the Q-network output tensor.
        optimizer: TF optimizer instance used for the critic update.
        grad_norm_clipping: optional gradient-norm clip forwarded to
            ``U.minimize_and_clip``.
        local_q_func: if True, condition Q only on this agent's own
            observation/action (DDPG-style) instead of all agents' (MADDPG).
        scope: variable scope name wrapping all created variables.
        reuse: variable-scope reuse flag.
        num_units: hidden-layer width for the Q-network.

    Returns:
        Tuple ``(train, update_target_q, extras)`` where ``train`` is a
        callable running one optimization step, ``update_target_q`` softly
        updates the target network, and ``extras`` maps ``'q_values'`` /
        ``'target_q_values'`` to evaluation callables.
    """
    with tf.variable_scope(scope, reuse=reuse):
        # create action distributions
        act_pdtype_n = [make_pdtype(act_space) for act_space in act_space_n]

        # set up placeholders
        obs_ph_n = make_obs_ph_n
        act_ph_n = [
            act_pdtype_n[i].sample_placeholder([None], name="action" + str(i))
            for i in range(len(act_space_n))
        ]
        target_ph = tf.placeholder(tf.float32, [None], name="target")

        # centralized critic conditions on every agent's obs and action;
        # a "local" critic sees only this agent's own obs/action
        q_input = tf.concat(obs_ph_n + act_ph_n, 1)
        if local_q_func:
            q_input = tf.concat([obs_ph_n[q_index], act_ph_n[q_index]], 1)
        q = q_func(q_input, 1, scope="q_func", num_units=num_units)[:, 0]
        q_func_vars = U.scope_vars(U.absolute_scope_name("q_func"))

        # TD error against the externally-computed Bellman target
        q_loss = tf.reduce_mean(tf.square(q - target_ph))

        # NOTE(review): an L2 regularizer on Q ("viscosity solution to the
        # Bellman differential equation in place of an initial condition")
        # was deliberately disabled; re-enable via
        # `loss = q_loss + 1e-3 * tf.reduce_mean(tf.square(q))` if needed.
        loss = q_loss

        optimize_expr = U.minimize_and_clip(optimizer, loss, q_func_vars,
                                            grad_norm_clipping)

        # Create callable functions
        train = U.function(inputs=obs_ph_n + act_ph_n + [target_ph],
                           outputs=loss,
                           updates=[optimize_expr])
        q_values = U.function(obs_ph_n + act_ph_n, q)

        # target network, softly tracked toward q_func by make_update_exp
        target_q = q_func(q_input, 1, scope="target_q_func",
                          num_units=num_units)[:, 0]
        target_q_func_vars = U.scope_vars(
            U.absolute_scope_name("target_q_func"))
        update_target_q = make_update_exp(q_func_vars, target_q_func_vars)

        target_q_values = U.function(obs_ph_n + act_ph_n, target_q)

        return train, update_target_q, {
            'q_values': q_values,
            'target_q_values': target_q_values
        }
def p_train(make_obs_ph_n, act_space_n, p_index, p_func, q_func, optimizer,
            grad_norm_clipping=None, local_q_func=False, num_units=64,
            scope="trainer", reuse=None):
    """Build the policy (actor) training graph for agent ``p_index``.

    The actor is trained to maximize the (already-built) critic's value of
    its sampled action; the critic network is reused via ``reuse=True``.

    Args:
        make_obs_ph_n: list of observation placeholders, one per agent.
        act_space_n: list of per-agent action spaces.
        p_index: index of the agent whose policy is being built.
        p_func: callable building the policy network
            ``(input, param_size, scope, num_units)``.
        q_func: callable building the critic network; must match the scope
            used by ``q_train`` so variables are shared.
        optimizer: TF optimizer instance used for the policy update.
        grad_norm_clipping: optional gradient-norm clip forwarded to
            ``U.minimize_and_clip``.
        local_q_func: if True, the critic input is only this agent's own
            observation/action (DDPG-style).
        num_units: hidden-layer width for both networks.
        scope: variable scope name wrapping all created variables.
        reuse: variable-scope reuse flag.

    Returns:
        Tuple ``(act, train, update_target_p, extras)`` where ``act`` samples
        an action from the current policy, ``train`` runs one optimization
        step, ``update_target_p`` softly updates the target policy, and
        ``extras`` maps ``'p_values'`` / ``'target_act'`` to callables.
    """
    with tf.variable_scope(scope, reuse=reuse):
        # create action distributions
        act_pdtype_n = [make_pdtype(act_space) for act_space in act_space_n]

        # set up placeholders
        obs_ph_n = make_obs_ph_n
        act_ph_n = [
            act_pdtype_n[i].sample_placeholder([None], name="action" + str(i))
            for i in range(len(act_space_n))
        ]

        p_input = obs_ph_n[p_index]

        p = p_func(p_input,
                   int(act_pdtype_n[p_index].param_shape()[0]),
                   scope="p_func",
                   num_units=num_units)
        p_func_vars = U.scope_vars(U.absolute_scope_name("p_func"))

        # wrap flat distribution parameters in a distribution object
        act_pd = act_pdtype_n[p_index].pdfromflat(p)

        act_sample = act_pd.sample()
        # small L2 penalty on the raw distribution parameters
        p_reg = tf.reduce_mean(tf.square(act_pd.flatparam()))

        # critic input: all agents' placeholder actions, except this agent's
        # slot is replaced by a freshly sampled (differentiable) action
        act_input_n = act_ph_n + []
        act_input_n[p_index] = act_pd.sample()
        q_input = tf.concat(obs_ph_n + act_input_n, 1)
        if local_q_func:
            q_input = tf.concat([obs_ph_n[p_index], act_input_n[p_index]], 1)
        q = q_func(q_input, 1, scope="q_func", reuse=True,
                   num_units=num_units)[:, 0]

        # maximize Q => minimize -Q, plus the regularizer
        pg_loss = -tf.reduce_mean(q)
        loss = pg_loss + p_reg * 1e-3

        optimize_expr = U.minimize_and_clip(optimizer, loss, p_func_vars,
                                            grad_norm_clipping)

        # Create callable functions
        train = U.function(inputs=obs_ph_n + act_ph_n,
                           outputs=loss,
                           updates=[optimize_expr])
        act = U.function(inputs=[obs_ph_n[p_index]], outputs=act_sample)
        p_values = U.function([obs_ph_n[p_index]], p)

        # target policy network, softly tracked by make_update_exp
        target_p = p_func(p_input,
                          int(act_pdtype_n[p_index].param_shape()[0]),
                          scope="target_p_func",
                          num_units=num_units)
        target_p_func_vars = U.scope_vars(
            U.absolute_scope_name("target_p_func"))
        update_target_p = make_update_exp(p_func_vars, target_p_func_vars)

        target_act_sample = act_pdtype_n[p_index].pdfromflat(target_p).sample()
        target_act = U.function(inputs=[obs_ph_n[p_index]],
                                outputs=target_act_sample)

        return act, train, update_target_p, {
            'p_values': p_values,
            'target_act': target_act
        }
def group_p_train(make_obs_ph_n, act_space_n, p_index, num_adversaries,
                  p_func, q_func, optimizer, grad_norm_clipping=None,
                  local_q_func=False, num_units=64, scope="trainer",
                  reuse=None):
    """Build a *group-shared* policy training graph for agent ``p_index``.

    Unlike ``p_train``, one policy network is shared across a whole group
    (either all adversaries or all good agents): group observations are
    batchified through a single ``p_func`` call and the resulting actions are
    un-batchified back into per-agent tensors. The loss averages the critic
    over every group member's action substitution.

    Args:
        make_obs_ph_n: list of observation placeholders, one per agent.
        act_space_n: list of per-agent action spaces.
        p_index: index of the agent this trainer acts for; ``p_index <
            num_adversaries`` selects the adversary group, otherwise the
            good-agent group.
        num_adversaries: number of adversary agents (first in the lists).
        p_func: callable building the shared policy network.
        q_func: callable building the critic network (reused via
            ``reuse=True``).
        optimizer: TF optimizer instance used for the policy update.
        grad_norm_clipping: optional gradient-norm clip.
        local_q_func: accepted for interface parity with ``p_train``;
            currently unused (the local-critic branch is disabled below).
        num_units: hidden-layer width for both networks.
        scope: variable scope name wrapping all created variables.
        reuse: variable-scope reuse flag.

    Returns:
        Tuple ``(act, train, update_target_p, extras)`` mirroring
        ``p_train``'s return value.
    """
    with tf.variable_scope(scope, reuse=reuse):
        # create action distributions
        act_pdtype_n = [make_pdtype(act_space) for act_space in act_space_n]

        # set up placeholders for a group: one full set of all-agent action
        # placeholders per group member
        obs_ph_n = make_obs_ph_n
        n_agents = len(obs_ph_n)
        if (p_index < num_adversaries):
            act_ph_ns = [[
                act_pdtype_n[i].sample_placeholder(
                    [None], name="action" + str(n) + '_' + str(i))
                for i in range(len(act_space_n))
            ] for n in range(num_adversaries)]
        else:
            act_ph_ns = [[
                act_pdtype_n[i].sample_placeholder(
                    [None], name="action" + str(n) + '_' + str(i))
                for i in range(len(act_space_n))
            ] for n in range(n_agents - num_adversaries)]
        act_ph_ns_flatten = list(chain.from_iterable(act_ph_ns))

        # batchify obs for all agents in the group: concat along features,
        # then fold group members into the batch dimension
        if (p_index < num_adversaries):
            # adversary group
            p_input = tf.concat(obs_ph_n[:num_adversaries], 1)
            p_input = tf.reshape(
                p_input, [-1, p_input.shape[-1].value // num_adversaries])
        else:
            # good-agent group
            p_input = tf.concat(obs_ph_n[num_adversaries:], 1)
            p_input = tf.reshape(
                p_input,
                [-1, p_input.shape[-1].value // (n_agents - num_adversaries)])

        # get all actions for the group from the shared policy network
        p = p_func(p_input,
                   int(act_pdtype_n[p_index].param_shape()[0]),
                   scope="p_func",
                   num_units=num_units)
        p_func_vars = U.scope_vars(U.absolute_scope_name("p_func"))

        # un-batchify policy outputs back into one tensor per group member
        if (p_index < num_adversaries):
            p = tf.reshape(p, [-1, p.shape[-1] * num_adversaries])
            ps = tf.split(p, num_or_size_splits=num_adversaries, axis=1)
        else:
            p = tf.reshape(p,
                           [-1, p.shape[-1] * (n_agents - num_adversaries)])
            ps = tf.split(p,
                          num_or_size_splits=(n_agents - num_adversaries),
                          axis=1)

        # wrap parameters in distributions and sample, per group member
        if (p_index < num_adversaries):
            act_pds = [
                act_pdtype_n[i].pdfromflat(ps[i])
                for i in range(num_adversaries)
            ]
            act_samples = [
                act_pds[i].sample() for i in range(num_adversaries)
            ]
        else:
            act_pds = [
                act_pdtype_n[i].pdfromflat(ps[i - num_adversaries])
                for i in range(num_adversaries, n_agents)
            ]
            act_samples = [
                act_pds[i].sample()
                for i in range(n_agents - num_adversaries)
            ]

        # average L2 regularizer over the whole group's parameters
        p_reg = tf.reduce_mean(
            tf.square(tf.concat([act_pd.flatparam() for act_pd in act_pds],
                                -1)))

        act_input_ns = act_ph_ns
        # build one critic input per group member (that member's placeholder
        # replaced by its sampled action), then batchify along axis 0
        if (p_index < num_adversaries):
            q_inputs = []
            for i in range(num_adversaries):
                act_input_ns[i][i] = act_pds[i].sample()
                q_inputs.append(tf.concat(obs_ph_n + act_input_ns[i], 1))
            q_input = tf.concat(q_inputs, 0)
        else:
            q_inputs = []
            for i in range(n_agents - num_adversaries):
                act_input_ns[i][i + num_adversaries] = act_pds[i].sample()
                q_inputs.append(tf.concat(obs_ph_n + act_input_ns[i], 1))
            q_input = tf.concat(q_inputs, 0)

        # NOTE(review): local_q_func branch intentionally disabled here
        # (see p_train for the single-agent equivalent).

        # feed the whole group's batched q_input into the shared critic
        q = q_func(q_input, 1, scope="q_func", reuse=True,
                   num_units=num_units)[:, 0]

        pg_loss = -tf.reduce_mean(q)
        loss = pg_loss + p_reg * 1e-3

        optimize_expr = U.minimize_and_clip(optimizer, loss, p_func_vars,
                                            grad_norm_clipping)

        # Create callable functions; index into the group's outputs for the
        # specific agent this trainer represents
        train = U.function(inputs=obs_ph_n + act_ph_ns_flatten,
                           outputs=loss,
                           updates=[optimize_expr])
        if (p_index < num_adversaries):
            act = U.function(inputs=[obs_ph_n[p_index]],
                             outputs=act_samples[p_index])
            p_values = U.function([obs_ph_n[p_index]], ps[p_index])
        else:
            act = U.function(inputs=[obs_ph_n[p_index]],
                             outputs=act_samples[p_index - num_adversaries])
            p_values = U.function([obs_ph_n[p_index]],
                                  ps[p_index - num_adversaries])

        # target network for the group: un-batchify p_input back to per-agent
        # observations, then build the target policy on this agent's slice
        if (p_index < num_adversaries):
            p_input = tf.reshape(p_input,
                                 [-1, p_input.shape[-1] * num_adversaries])
            p_inputs = tf.split(p_input,
                                num_or_size_splits=num_adversaries,
                                axis=1)
            target_p = p_func(p_inputs[p_index],
                              int(act_pdtype_n[p_index].param_shape()[0]),
                              scope="target_p_func",
                              num_units=num_units)
        else:
            p_input = tf.reshape(
                p_input,
                [-1, p_input.shape[-1] * (n_agents - num_adversaries)])
            p_inputs = tf.split(
                p_input,
                num_or_size_splits=(n_agents - num_adversaries),
                axis=1)
            target_p = p_func(p_inputs[p_index - num_adversaries],
                              int(act_pdtype_n[p_index].param_shape()[0]),
                              scope="target_p_func",
                              num_units=num_units)
        target_p_func_vars = U.scope_vars(
            U.absolute_scope_name("target_p_func"))
        update_target_p = make_update_exp(p_func_vars, target_p_func_vars)

        target_act_sample = act_pdtype_n[p_index].pdfromflat(target_p).sample()
        target_act = U.function(inputs=[obs_ph_n[p_index]],
                                outputs=target_act_sample)

        return act, train, update_target_p, {
            'p_values': p_values,
            'target_act': target_act
        }