def p_train(make_obs_ph_n, act_space_n, p_index, p_func, q_func, optimizer,
            grad_norm_clipping=None, local_q_func=False, scope="trainer",
            reuse=None, layer_norm=True):
    with tf.variable_scope(scope, reuse=reuse):
        # create distributions
        act_pdtype_n = [make_pdtype(act_space) for act_space in act_space_n]

        # set up placeholders
        obs_ph_n = make_obs_ph_n
        act_ph_n = [
            act_pdtype_n[i].sample_placeholder([None], name="action" + str(i))
            for i in range(len(act_space_n))
        ]
        weight_ph = tf.placeholder(tf.float32, [None], name="important_weight")

        p_input = obs_ph_n[p_index]
        p = p_func(p_input,
                   int(act_pdtype_n[p_index].param_shape()[0]),
                   scope="p_func",
                   num_units=FLAGS.num_units,
                   layer_norm=layer_norm)
        p_func_vars = U.scope_vars(U.absolute_scope_name("p_func"))
        reg_loss = tf.contrib.layers.apply_regularization(
            tf.contrib.layers.l2_regularizer(FLAGS.lambda2), p_func_vars)

        # wrap parameters in distribution
        act_pd = act_pdtype_n[p_index].pdfromflat(p)
        # TODO: a deterministic action sample was added here
        determin_act_sample, act_sample = act_pd.sample(deterministic=True)
        # p_reg = tf.reduce_mean(tf.square(act_pd.flatparam()))

        act_input_n = act_ph_n + []  # shallow copy of the action placeholders
        act_input_n[p_index] = act_pd.sample()  # act_pd.mode(); sample action from current policy

        # build q-function input
        # print("no adv state info, no adv action info ...")
        if p_index < FLAGS.num_adversaries:  # predator
            q_input = tf.concat(
                obs_ph_n[:FLAGS.num_adversaries] +
                act_input_n[:FLAGS.num_adversaries], 1)
            train_obs_input = obs_ph_n[:FLAGS.num_adversaries]
            train_action_input = act_ph_n[:FLAGS.num_adversaries]
        else:
            q_input = tf.concat(
                obs_ph_n[FLAGS.num_adversaries:] +
                act_input_n[FLAGS.num_adversaries:], 1)
            train_obs_input = obs_ph_n[FLAGS.num_adversaries:]
            train_action_input = act_ph_n[FLAGS.num_adversaries:]
        q_num_units = FLAGS.num_units_ma  # cell number for maddpg
        if local_q_func:
            q_input = tf.concat([obs_ph_n[p_index], act_input_n[p_index]], 1)
            q_num_units = FLAGS.num_units  # cell number for ddpg
        q = q_func(q_input,
                   1,
                   scope="q_func",
                   reuse=True,
                   num_units=q_num_units,
                   layer_norm=layer_norm)[:, 0]

        # pg_loss = -tf.reduce_mean(q * weight_ph)
        pg_loss = -tf.reduce_mean(q)
        loss = pg_loss + reg_loss
        # loss = pg_loss

        # return
        act = U.function(inputs=[obs_ph_n[p_index]],
                         outputs=[act_sample, determin_act_sample])
        p_values = U.function([obs_ph_n[p_index]], p)

        # target network
        target_p = p_func(p_input,
                          int(act_pdtype_n[p_index].param_shape()[0]),
                          scope="target_p_func",
                          num_units=FLAGS.num_units,
                          layer_norm=layer_norm)
        target_p_func_vars = U.scope_vars(
            U.absolute_scope_name("target_p_func"))
        update_target_p = make_update_exp(p_func_vars, target_p_func_vars)

        target_act_sample = act_pdtype_n[p_index].pdfromflat(target_p).sample()
        target_act = U.function(inputs=[obs_ph_n[p_index]],
                                outputs=target_act_sample)

        # build optimizer
        optimize_expr = U.minimize_and_clip(optimizer, loss, p_func_vars,
                                            grad_norm_clipping)

        # Create callable functions
        train = U.function(
            inputs=train_obs_input + train_action_input + [weight_ph],
            # outputs=[loss, pg_loss, distance, reg_loss],
            outputs=[],
            updates=[optimize_expr])

        return act, train, update_target_p, {
            'p_values': p_values,
            'target_act': target_act,
            'act_pdtype': act_pdtype_n[p_index]
        }
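# Usage sketch for p_train (an assumption, not code from this repo): how the returned
# callables might be driven from a training loop. `mlp_model`, `obs_batch`,
# `obs_n_batch`, `act_n_batch`, and `weights` are hypothetical names.
#
#   act, p_step, update_target_p, p_debug = p_train(
#       obs_ph_n, act_space_n, p_index=0, p_func=mlp_model, q_func=mlp_model,
#       optimizer=tf.train.AdamOptimizer(1e-3))
#   stochastic_a, deterministic_a = act(obs_batch)    # both outputs of act_pd.sample
#   p_step(*(obs_n_batch + act_n_batch + [weights]))  # one policy-gradient step
#   update_target_p()                                 # Polyak update of the target policy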
def q_train(make_obs_ph_n, act_space_n, q_index, q_func, optimizer,
            grad_norm_clipping=None, local_q_func=False, scope="trainer",
            reuse=None, layer_norm=True):
    with tf.variable_scope(scope, reuse=reuse):
        # create distributions
        act_pdtype_n = [make_pdtype(act_space) for act_space in act_space_n]

        # set up placeholders
        obs_ph_n = make_obs_ph_n
        act_ph_n = [
            act_pdtype_n[i].sample_placeholder([None], name="action" + str(i))
            for i in range(len(act_space_n))
        ]
        target_ph = tf.placeholder(tf.float32, [None], name="target")
        return_ph = tf.placeholder(tf.float32, [None], name="return")
        dis_2_end_ph = tf.placeholder(tf.float32, [None], name="dis_2_end")
        lambda1_ph = tf.placeholder(tf.float32, shape=[], name='lambda1')
        weight_ph = tf.placeholder(tf.float32, [None], name="important_weight")

        # build q-function input
        if q_index < FLAGS.num_adversaries:  # predator
            q_input = tf.concat(
                obs_ph_n[:FLAGS.num_adversaries] +
                act_ph_n[:FLAGS.num_adversaries], 1)
            train_obs_input = obs_ph_n[:FLAGS.num_adversaries]
            train_action_input = act_ph_n[:FLAGS.num_adversaries]
        else:
            q_input = tf.concat(
                obs_ph_n[FLAGS.num_adversaries:] +
                act_ph_n[FLAGS.num_adversaries:], 1)
            train_obs_input = obs_ph_n[FLAGS.num_adversaries:]
            train_action_input = act_ph_n[FLAGS.num_adversaries:]
        q_num_units = FLAGS.num_units_ma  # cell number for maddpg
        if local_q_func:
            q_input = tf.concat([obs_ph_n[q_index], act_ph_n[q_index]], 1)
            q_num_units = FLAGS.num_units  # cell number for ddpg

        q = q_func(q_input,
                   1,
                   scope="q_func",
                   num_units=q_num_units,
                   layer_norm=layer_norm)[:, 0]
        q_func_vars = U.scope_vars(U.absolute_scope_name("q_func"))
        reg_loss = tf.contrib.layers.apply_regularization(
            tf.contrib.layers.l2_regularizer(FLAGS.lambda2), q_func_vars)

        # TODO: when using the prioritized replay buffer, weight the loss by the importance weights
        td_0 = target_ph - q
        q_loss_td_0 = -tf.reduce_mean(weight_ph * tf.stop_gradient(td_0) * q)
        q_td_0_loss = tf.reduce_mean(weight_ph * tf.square(td_0))

        # TODO: clip so that only positive differences (R - Q) > 0 are kept
        # mask = tf.where(return_ph - tf.squeeze(q) > 0.0,
        #                 tf.ones_like(return_ph), tf.zeros_like(return_ph))
        # TODO: add dis_2_end: return_confidence_factor
        confidence = tf.pow(FLAGS.return_confidence_factor, dis_2_end_ph)
        # td_n = (return_ph * confidence - q) * mask
        # TODO: add clip here...
        # td_n = tf.clip_by_value(return_ph * confidence - q, 0., 4.) * mask
        td_n = tf.clip_by_value(return_ph * confidence - q, 0., 4.)
        q_loss_monte_carlo = -tf.reduce_mean(
            weight_ph * tf.stop_gradient(td_n) * q)
        # q_td_n_loss = tf.reduce_mean(weight_ph * tf.square((return_ph * confidence - q) * mask))
        q_td_n_loss = tf.reduce_mean(weight_ph * tf.square(td_n))

        loss = q_loss_td_0 + lambda1_ph * q_loss_monte_carlo + reg_loss
        # loss = q_td_0_loss + lambda1_ph * q_td_n_loss + lambda2_ph * margin_classification_loss + reg_loss

        q_values = U.function(train_obs_input + train_action_input, q)

        # target network
        target_q = q_func(q_input,
                          1,
                          scope="target_q_func",
                          num_units=q_num_units,
                          layer_norm=layer_norm)[:, 0]
        target_q_func_vars = U.scope_vars(
            U.absolute_scope_name("target_q_func"))
        update_target_q = make_update_exp(q_func_vars, target_q_func_vars)

        target_q_values = U.function(train_obs_input + train_action_input,
                                     target_q)

        # build optimizer
        optimize_expr = U.minimize_and_clip(optimizer, loss, q_func_vars,
                                            grad_norm_clipping)

        # Create callable functions
        train = U.function(
            inputs=train_obs_input + train_action_input + [target_ph] + [
                weight_ph,
                lambda1_ph,
                dis_2_end_ph,
                return_ph,
            ],
            outputs=[],
            # outputs=[loss, q_loss_td_0, q_loss_monte_carlo, margin_classification_loss, reg_loss,
            #          q_td_0_loss, q_td_n_loss],
            updates=[optimize_expr])

        return train, update_target_q, {
            'q_values': q_values,
            'target_q_values': target_q_values
        }
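# Usage sketch for q_train (an assumption about the surrounding training loop, not code
# from this repo): the TD(0) target fed into target_ph is built from the teammates'
# target policies and the target critic. `gamma`, `rew`, `done`, the `*_batch` arrays,
# and `teammates_p_debug` are hypothetical names.
#
#   q_step, update_target_q, q_debug = q_train(
#       obs_ph_n, act_space_n, q_index=0, q_func=mlp_model,
#       optimizer=tf.train.AdamOptimizer(1e-3))
#   target_act_next = [d['target_act'](o) for d, o in zip(teammates_p_debug, team_obs_next_batch)]
#   q_next = q_debug['target_q_values'](*(team_obs_next_batch + target_act_next))
#   target = rew + gamma * (1.0 - done) * q_next
#   q_step(*(team_obs_batch + team_act_batch +
#            [target, weights, lambda1, dis_2_end_batch, return_batch]))
#   update_target_q()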
def discriminator_train(obs_shape_n, act_space_n, agent_index, discriminator_func,
                        optimizer, grad_norm_clipping=None, scope="trainer",
                        reuse=None):
    with tf.variable_scope(scope, reuse=reuse):
        expert_dis_2_end_ph = tf.placeholder(tf.float32, [None],
                                             name="expert_dis_2_end")
        policy_dis_2_end_ph = tf.placeholder(tf.float32, [None],
                                             name="policy_dis_2_end")
        state_action_confidence_factor_ph = tf.placeholder(
            tf.float32, shape=[], name='state_action_confidence_factor')

        # create act distributions
        act_pdtype = make_pdtype(act_space_n[agent_index])

        # set up placeholders
        expert_act_ph = act_pdtype.sample_placeholder(
            [None], name="expert_action" + str(agent_index))
        expert_state_ph = U.BatchInput(
            obs_shape_n[agent_index],
            name="expert_observation" + str(agent_index)).get()
        policy_act_ph = act_pdtype.sample_placeholder(
            [None], name="policy_action" + str(agent_index))
        policy_state_ph = U.BatchInput(
            obs_shape_n[agent_index],
            name="policy_observation" + str(agent_index)).get()

        # input for discriminator
        expert_input = tf.concat([expert_state_ph, expert_act_ph], 1)
        d_model_real, d_logits_real = discriminator_func(
            expert_input, scope="discriminator", num_units=FLAGS.num_units)
        policy_input = tf.concat([policy_state_ph, policy_act_ph], 1)
        d_model_fake, d_logits_fake = discriminator_func(
            policy_input, scope="discriminator", reuse=True,
            num_units=FLAGS.num_units)
        discriminator_func_vars = U.scope_vars(
            U.absolute_scope_name("discriminator"))

        # Calculate losses.
        # To help the discriminator generalize better, the positive labels are reduced
        # from 1.0 to 0.9 via the `smooth` parameter. This is label smoothing, a technique
        # commonly used with classifiers to improve performance.
        smooth = 0.1
        if FLAGS.consider_state_action_confidence:
            print('consider state action confidence...')
            expert_confidence = tf.pow(state_action_confidence_factor_ph,
                                       expert_dis_2_end_ph)
            expert_confidence_sum = tf.reduce_sum(expert_confidence)
            d_loss_real = tf.reduce_sum(
                tf.nn.sigmoid_cross_entropy_with_logits(
                    logits=d_logits_real,
                    labels=tf.ones_like(d_logits_real) * (1 - smooth))
                * expert_confidence) / expert_confidence_sum
            policy_confidence = tf.pow(state_action_confidence_factor_ph,
                                       policy_dis_2_end_ph)
            policy_confidence_sum = tf.reduce_sum(policy_confidence)
            d_loss_fake = tf.reduce_sum(
                tf.nn.sigmoid_cross_entropy_with_logits(
                    logits=d_logits_fake,
                    labels=tf.zeros_like(d_logits_fake))
                * policy_confidence) / policy_confidence_sum
        else:
            print("doesn't consider state action confidence...")
            d_loss_real = tf.reduce_mean(
                tf.nn.sigmoid_cross_entropy_with_logits(
                    logits=d_logits_real,
                    labels=tf.ones_like(d_logits_real) * (1 - smooth)))
            d_loss_fake = tf.reduce_mean(
                tf.nn.sigmoid_cross_entropy_with_logits(
                    logits=d_logits_fake,
                    labels=tf.zeros_like(d_logits_fake)))

        d_loss = d_loss_real + d_loss_fake

        # build optimizer
        update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS,
                                       scope=tf.get_variable_scope().name)
        update_ops_q = [
            item for item in update_ops
            if item.name.find('discriminator') != -1
        ]
        print('discriminator-func, batch norm update parameters: ', update_ops_q)
        print("all update ops: ", tf.get_collection(tf.GraphKeys.UPDATE_OPS))
        with tf.control_dependencies(update_ops_q):
            optimize_expr = U.minimize_and_clip(optimizer, d_loss,
                                                discriminator_func_vars,
                                                grad_norm_clipping)

        # Create callable functions
        train = U.function(
            inputs=[
                expert_state_ph, expert_act_ph, policy_state_ph, policy_act_ph,
                state_action_confidence_factor_ph, expert_dis_2_end_ph,
                policy_dis_2_end_ph
            ],
            outputs=[d_loss, d_loss_fake],
            updates=[optimize_expr])

        d_model_fake_values = U.function([policy_state_ph, policy_act_ph],
                                         outputs=d_model_fake)

        # -np.log(0.99) = 0.01, -np.log(0.01) = 4.61
        d_model_fake_clipped = tf.clip_by_value(d_model_fake, 0.01, 0.99)
        # d_model_fake_clipped = tf.clip_by_value(d_model_fake, 0.1, 0.9)
        # TODO: clip reward to [-0.5, 1.5]
        imitation_reward = tf.clip_by_value(
            -tf.log(1. - d_model_fake_clipped)[:, 0] + np.log(0.5), -0.5, 1.5)
        # if args.subtract_baseline:
        #     imitation_reward = -tf.log(1. - d_model_fake_clipped)[:, 0] + np.log(0.5)
        # else:
        #     imitation_reward = -tf.log(1. - d_model_fake_clipped)[:, 0]
        imitation_reward_values = U.function([policy_state_ph, policy_act_ph],
                                             outputs=imitation_reward)

        return train, imitation_reward_values
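# Usage sketch for discriminator_train (an assumption, not code from this repo): the
# discriminator is updated on matched expert/policy batches, then the clipped imitation
# reward is queried for policy transitions. `discriminator_model`, the batch arrays, and
# `confidence_factor` are hypothetical names.
#
#   d_step, imitation_reward = discriminator_train(
#       obs_shape_n, act_space_n, agent_index=0,
#       discriminator_func=discriminator_model,
#       optimizer=tf.train.AdamOptimizer(1e-4))
#   d_loss, d_loss_fake = d_step(expert_obs, expert_act, policy_obs, policy_act,
#                                confidence_factor, expert_dis_2_end, policy_dis_2_end)
#   r_imit = imitation_reward(policy_obs, policy_act)  # clipped to [-0.5, 1.5] above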