import tensorflow as tf

import maddpg.common.tf_util as U  # assumed module paths, matching the MADDPG codebase
from maddpg.common.distributions import make_pdtype


def c_train(make_obs_ph_n, make_target_loc_ph_n, c_index, c_func, q_func,
            optimizer, scope="trainer", num_units=128, grad_norm_clipping=None,
            reuse=tf.AUTO_REUSE, local_q_func=False):
    with tf.variable_scope(scope, reuse=reuse):
        # set up placeholders
        obs_ph_n = make_obs_ph_n
        target_loc_ph = make_target_loc_ph_n[c_index]  # [None, 2] target-location placeholder, created by the caller
        self_obs_ph = obs_ph_n[c_index]
        labels_ph = tf.placeholder(tf.float32, [None, 2], name="labels")

        # prior network: binary classifier over (own observation, target location)
        c_input = tf.concat((self_obs_ph, target_loc_ph), 1)
        c = c_func(c_input, 2, scope="c_func", type='cls', num_units=num_units)
        c_pred = tf.nn.softmax(c)
        c_flags = tf.greater(c_pred[:, 0], 0.5)
        c_func_vars = U.scope_vars(U.absolute_scope_name("c_func"))

        # loss and optimization
        loss = tf.reduce_mean(
            tf.nn.softmax_cross_entropy_with_logits(logits=c, labels=labels_ph))
        optimize_expr = U.minimize_and_clip(optimizer, loss, c_func_vars,
                                            grad_norm_clipping)

        # create callable functions
        c_train = U.function(inputs=[obs_ph_n[c_index], target_loc_ph, labels_ph],
                             outputs=loss,
                             updates=[optimize_expr])
        c_act = U.function(inputs=[obs_ph_n[c_index], target_loc_ph],
                           outputs=c_flags)
        c_values = U.function([obs_ph_n[c_index], target_loc_ph],
                              outputs=c_pred)

        # target network
        target_c_values = c_func(c_input, 2, scope="target_c_func", type='cls',
                                 num_units=num_units)
        target_c_pred = tf.nn.softmax(target_c_values)
        target_c_flags = tf.greater(target_c_pred[:, 0], 0.5)
        target_c_func_vars = U.scope_vars(U.absolute_scope_name("target_c_func"))
        update_target_c = make_update_exp(c_func_vars, target_c_func_vars)

        target_c_act = U.function(inputs=[obs_ph_n[c_index], target_loc_ph],
                                  outputs=target_c_flags)

        return c_act, c_train, update_target_c, {'c_values': c_values,
                                                 'target_c_act': target_c_act}
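
# Usage sketch for the callables returned by c_train (hypothetical array names,
# not part of the original code): the classifier takes the agent's own
# observation plus a 2-D target location, and is trained on one-hot labels.
#
#   c_act_fn, c_train_fn, update_target_c, c_debug = c_train(...)
#   flags = c_act_fn(obs_batch, target_loc_batch)                 # bool per sample: P(class 0) > 0.5
#   loss = c_train_fn(obs_batch, target_loc_batch, labels_batch)  # labels: [batch, 2] one-hot
#   update_target_c()                                             # soft-update the target classifier
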
def q_train(make_obs_ph_n, act_space_n, q_index, q_func, optimizer,
            grad_norm_clipping=None, local_q_func=False, scope="trainer",
            reuse=tf.AUTO_REUSE, num_units=128):
    with tf.variable_scope(scope, reuse=reuse):
        # create distributions
        act_pdtype_n = [make_pdtype(act_space) for act_space in act_space_n]

        # set up placeholders
        obs_ph_n = make_obs_ph_n
        act_ph_n = [act_pdtype_n[i].sample_placeholder([None], name="action" + str(i))
                    for i in range(len(act_space_n))]
        target_ph = tf.placeholder(tf.float32, [None], name="target")

        # q network: centralized critic over all observations and actions,
        # or a local critic over this agent's own (obs, action) pair
        q_input = tf.concat(obs_ph_n + act_ph_n, 1)
        if local_q_func:
            q_input = tf.concat([obs_ph_n[q_index], act_ph_n[q_index]], 1)
        q = q_func(q_input, 1, scope="q_func", type='fit', num_units=num_units)[:, 0]
        q_func_vars = U.scope_vars(U.absolute_scope_name("q_func"))

        q_loss = tf.reduce_mean(tf.square(q - target_ph))

        # viscosity solution to Bellman differential equation in place of an initial condition
        q_reg = tf.reduce_mean(tf.square(q))
        loss = q_loss  # + 1e-3 * q_reg

        optimize_expr = U.minimize_and_clip(optimizer, loss, q_func_vars,
                                            grad_norm_clipping)

        # create callable functions
        train = U.function(inputs=obs_ph_n + act_ph_n + [target_ph],
                           outputs=loss,
                           updates=[optimize_expr])
        q_values = U.function(obs_ph_n + act_ph_n, q)

        # target network
        target_q = q_func(q_input, 1, scope="target_q_func", type='fit',
                          num_units=num_units)[:, 0]
        target_q_func_vars = U.scope_vars(U.absolute_scope_name("target_q_func"))
        update_target_q = make_update_exp(q_func_vars, target_q_func_vars)

        target_q_values = U.function(obs_ph_n + act_ph_n, target_q)

        return train, update_target_q, {'q_values': q_values,
                                        'target_q_values': target_q_values}
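
# Sketch of how the q_train outputs are typically wired together in a
# MADDPG-style critic update (names rew, gamma, done, obs_next_n, and
# target_act_n are hypothetical; the actual loop lives in the trainer class):
#
#   train_fn, update_target_q, q_debug = q_train(...)
#   target_q_next = q_debug['target_q_values'](*(obs_next_n + target_act_n))
#   td_target = rew + gamma * (1.0 - done) * target_q_next
#   q_loss = train_fn(*(obs_n + act_n + [td_target]))
#   update_target_q()
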
def make_update_exp(vals, target_vals):
    # Polyak-averaged (soft) target update:
    # theta_target <- polyak * theta_target + (1 - polyak) * theta
    polyak = 1.0 - 1e-2
    expression = []
    for var, var_target in zip(sorted(vals, key=lambda v: v.name),
                               sorted(target_vals, key=lambda v: v.name)):
        expression.append(
            var_target.assign(polyak * var_target + (1.0 - polyak) * var))
    expression = tf.group(*expression)
    return U.function([], [], updates=[expression])
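
# Minimal numeric sketch of the soft-update rule above (illustration only, not
# used by the trainers): with polyak = 0.99, each call moves the target
# parameters 1% of the way toward the online parameters, so after n updates
# the target has covered a fraction 1 - 0.99**n of the gap.
def _soft_update_demo(n=100):
    polyak = 1.0 - 1e-2
    theta, theta_target = 1.0, 0.0
    for _ in range(n):
        theta_target = polyak * theta_target + (1.0 - polyak) * theta
    return theta_target  # ~0.634 for n=100, i.e. 1 - 0.99**100
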
def p_m_train(make_obs_ph_n, make_message_ph_n, act_space_n, num_agents_obs,
              p_index, m_func, p_func, q_func, optimizer,
              grad_norm_clipping=None, local_q_func=False, num_units=128,
              scope="trainer", reuse=tf.AUTO_REUSE):
    with tf.variable_scope(scope, reuse=reuse):
        # create distributions
        act_pdtype_n = [make_pdtype(act_space) for act_space in act_space_n]

        # set up placeholders
        obs_ph_n = make_obs_ph_n
        message_ph_n = make_message_ph_n
        act_ph_n = [act_pdtype_n[i].sample_placeholder([None], name="action" + str(i))
                    for i in range(len(act_space_n))]
        blz_distribution = tf.placeholder(tf.float32, [None, act_space_n[p_index].n],
                                          name="blz_distribution")

        m_input = message_ph_n[p_index]
        encode_dim = m_input.get_shape().as_list()[-1]

        # message encoder
        message_encode = m_func(m_input, encode_dim, num_agents_obs,
                                scope='m_func', num_units=num_units)
        m_func_vars = U.scope_vars(U.absolute_scope_name("m_func"))

        # policy: conditioned on own observation and the encoded messages
        p_input = tf.concat((obs_ph_n[p_index], message_encode), 1)
        p = p_func(p_input, int(act_pdtype_n[p_index].param_shape()[0]),
                   scope="p_func", type='fit', num_units=num_units)
        p_func_vars = U.scope_vars(U.absolute_scope_name("p_func"))

        # wrap parameters in distribution
        act_pd = act_pdtype_n[p_index].pdfromflat(p)
        act_sample = act_pd.sample()
        p_reg = tf.reduce_mean(tf.square(act_pd.flatparam()))

        act_input_n = act_ph_n + []

        # correlation regularizer (currently disabled)
        k = tf.keras.losses.KLDivergence()
        # KL_reg = k(blz_distribution, act_sample)

        # q network: evaluate the sampled action with the (shared) critic
        act_input_n[p_index] = act_pd.sample()
        q_input = tf.concat(obs_ph_n + act_input_n, 1)
        if local_q_func:
            q_input = tf.concat([obs_ph_n[p_index], act_input_n[p_index]], 1)
        q = q_func(q_input, 1, scope="q_func", type='fit', reuse=True,
                   num_units=num_units)[:, 0]

        # loss and optimization: deterministic policy gradient, maximizing Q
        pg_loss = -tf.reduce_mean(q)
        loss = pg_loss  # + KL_reg * 1e-2

        # optimize the policy and message-encoder variables jointly
        optimize_expr = U.minimize_and_clip(optimizer, loss,
                                            p_func_vars + m_func_vars,
                                            grad_norm_clipping)

        # create callable functions
        train = U.function(inputs=obs_ph_n + message_ph_n + act_ph_n + [blz_distribution],
                           outputs=loss,
                           updates=[optimize_expr])
        act = U.function(inputs=[obs_ph_n[p_index], message_ph_n[p_index]],
                         outputs=act_sample)
        p_values = U.function([obs_ph_n[p_index], message_ph_n[p_index]],
                              outputs=p)

        # target networks for the message encoder and the policy
        target_message_encode = m_func(m_input, encode_dim, num_agents_obs,
                                       scope='target_m_func', num_units=num_units)
        target_m_func_vars = U.scope_vars(U.absolute_scope_name("target_m_func"))

        target_p_input = tf.concat((obs_ph_n[p_index], target_message_encode), 1)
        target_p = p_func(target_p_input, int(act_pdtype_n[p_index].param_shape()[0]),
                          scope="target_p_func", type='fit', num_units=num_units)
        target_p_func_vars = U.scope_vars(U.absolute_scope_name("target_p_func"))

        update_target_m = make_update_exp(m_func_vars, target_m_func_vars)
        update_target_p = make_update_exp(p_func_vars, target_p_func_vars)

        target_act_sample = act_pdtype_n[p_index].pdfromflat(target_p).sample()
        target_act = U.function(inputs=[obs_ph_n[p_index], message_ph_n[p_index]],
                                outputs=target_act_sample)

        return act, train, update_target_p, update_target_m, {'p_values': p_values,
                                                              'target_act': target_act}
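
# Usage sketch for the p_m_train outputs (hypothetical names, mirroring the
# other trainers): actions are sampled from the policy given the agent's own
# observation and incoming messages, while the policy-gradient step consumes
# the joint observations, messages, and actions. The blz_distribution batch is
# still fed but unused while the KL regularizer stays commented out.
#
#   act_fn, train_fn, update_target_p, update_target_m, p_debug = p_m_train(...)
#   action = act_fn(obs_batch, message_batch)
#   p_loss = train_fn(*(obs_n + message_n + act_n + [blz_batch]))
#   update_target_p()
#   update_target_m()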