Example 1
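The three examples below share the same TF1-era context. A minimal sketch of the assumed imports and helpers follows; the module paths, the `U` alias, and the description of `make_update_exp` are assumptions inferred from the identifiers used in the code, in the style of the OpenAI MADDPG codebase.

import numpy as np
import tensorflow as tf  # TF1 API: tf.placeholder, tf.variable_scope, tf.contrib

# Assumed helper modules (paths follow the OpenAI MADDPG layout; not confirmed by the listing):
# import maddpg.common.tf_util as U                   # U.function, U.scope_vars, U.BatchInput, ...
# from maddpg.common.distributions import make_pdtype
# make_update_exp(vars, target_vars) is assumed to build a soft (Polyak) target-update op.
# FLAGS is assumed to be an absl/tf flags object providing the fields referenced below:
# num_units, num_units_ma, num_adversaries, lambda2, return_confidence_factor,
# consider_state_action_confidence.
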
def p_train(make_obs_ph_n,
            act_space_n,
            p_index,
            p_func,
            q_func,
            optimizer,
            grad_norm_clipping=None,
            local_q_func=False,
            scope="trainer",
            reuse=None,
            layer_norm=True):
    with tf.variable_scope(scope, reuse=reuse):
        # create distributions
        act_pdtype_n = [make_pdtype(act_space) for act_space in act_space_n]

        # set up placeholders
        obs_ph_n = make_obs_ph_n
        act_ph_n = [
            act_pdtype_n[i].sample_placeholder([None], name="action" + str(i))
            for i in range(len(act_space_n))
        ]
        weight_ph = tf.placeholder(tf.float32, [None], name="important_weight")
        p_input = obs_ph_n[p_index]

        p = p_func(p_input,
                   int(act_pdtype_n[p_index].param_shape()[0]),
                   scope="p_func",
                   num_units=FLAGS.num_units,
                   layer_norm=layer_norm)

        p_func_vars = U.scope_vars(U.absolute_scope_name("p_func"))
        reg_loss = tf.contrib.layers.apply_regularization(
            tf.contrib.layers.l2_regularizer(FLAGS.lambda2), p_func_vars)

        # wrap parameters in distribution
        act_pd = act_pdtype_n[p_index].pdfromflat(p)

        # TODO: deterministic action sampling added here
        determin_act_sample, act_sample = act_pd.sample(deterministic=True)
        # p_reg = tf.reduce_mean(tf.square(act_pd.flatparam()))

        act_input_n = act_ph_n + []  # shallow copy of the action placeholders
        # replace this agent's action with a sample from the current policy
        # (act_pd.mode() would give the deterministic action instead)
        act_input_n[p_index] = act_pd.sample()

        # build q-function input
        # print("no adv state info, no adv action info ...")
        if p_index < FLAGS.num_adversaries:  # predator
            q_input = tf.concat(
                obs_ph_n[:FLAGS.num_adversaries] +
                act_input_n[:FLAGS.num_adversaries], 1)
            train_obs_input = obs_ph_n[:FLAGS.num_adversaries]
            train_action_input = act_ph_n[:FLAGS.num_adversaries]
        else:
            q_input = tf.concat(
                obs_ph_n[FLAGS.num_adversaries:] +
                act_input_n[FLAGS.num_adversaries:], 1)
            train_obs_input = obs_ph_n[FLAGS.num_adversaries:]
            train_action_input = act_ph_n[FLAGS.num_adversaries:]

        q_num_units = FLAGS.num_units_ma  # number of hidden units for MADDPG
        if local_q_func:
            q_input = tf.concat([obs_ph_n[p_index], act_input_n[p_index]], 1)
            q_num_units = FLAGS.num_units  # number of hidden units for DDPG

        q = q_func(q_input,
                   1,
                   scope="q_func",
                   reuse=True,
                   num_units=q_num_units,
                   layer_norm=layer_norm)[:, 0]
        # pg_loss = -tf.reduce_mean(q * weight_ph)
        pg_loss = -tf.reduce_mean(q)

        loss = pg_loss + reg_loss
        # loss = pg_loss

        # return
        act = U.function(inputs=[obs_ph_n[p_index]],
                         outputs=[act_sample, determin_act_sample])
        p_values = U.function([obs_ph_n[p_index]], p)

        # target network
        target_p = p_func(p_input,
                          int(act_pdtype_n[p_index].param_shape()[0]),
                          scope="target_p_func",
                          num_units=FLAGS.num_units,
                          layer_norm=layer_norm)
        target_p_func_vars = U.scope_vars(
            U.absolute_scope_name("target_p_func"))
        update_target_p = make_update_exp(p_func_vars, target_p_func_vars)

        target_act_sample = act_pdtype_n[p_index].pdfromflat(target_p).sample()
        target_act = U.function(inputs=[obs_ph_n[p_index]],
                                outputs=target_act_sample)

        # build optimizer
        optimize_expr = U.minimize_and_clip(optimizer, loss, p_func_vars,
                                            grad_norm_clipping)

        # Create callable functions
        train = U.function(
            inputs=train_obs_input + train_action_input + [weight_ph],
            # outputs=[loss, pg_loss, distance, reg_loss],
            outputs=[],
            updates=[optimize_expr])

        return act, train, update_target_p, {
            'p_values': p_values,
            'target_act': target_act,
            'act_pdtype': act_pdtype_n[p_index]
        }
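
For reference, a hypothetical model compatible with the `p_func` / `q_func` signature used above. Only the keyword arguments (`scope`, `reuse`, `num_units`, `layer_norm`) are taken from the calls in the example; the name `mlp_model` and the two-hidden-layer architecture are assumptions, loosely following the reference MADDPG networks.

import tensorflow as tf

def mlp_model(inputs, num_outputs, scope, reuse=False, num_units=64, layer_norm=True):
    # Hypothetical two-hidden-layer MLP; the real p_func/q_func may differ.
    with tf.variable_scope(scope, reuse=reuse):
        out = inputs
        for _ in range(2):
            out = tf.layers.dense(out, num_units, activation=None)
            if layer_norm:
                out = tf.contrib.layers.layer_norm(out)
            out = tf.nn.relu(out)
        return tf.layers.dense(out, num_outputs, activation=None)
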
Example 2
def q_train(make_obs_ph_n,
            act_space_n,
            q_index,
            q_func,
            optimizer,
            grad_norm_clipping=None,
            local_q_func=False,
            scope="trainer",
            reuse=None,
            layer_norm=True):
    with tf.variable_scope(scope, reuse=reuse):
        # create distributions
        act_pdtype_n = [make_pdtype(act_space) for act_space in act_space_n]
        # set up placeholders
        obs_ph_n = make_obs_ph_n
        act_ph_n = [
            act_pdtype_n[i].sample_placeholder([None], name="action" + str(i))
            for i in range(len(act_space_n))
        ]
        target_ph = tf.placeholder(tf.float32, [None], name="target")
        return_ph = tf.placeholder(tf.float32, [None], name="return")
        dis_2_end_ph = tf.placeholder(tf.float32, [None], name="dis_2_end")
        lambda1_ph = tf.placeholder(tf.float32, shape=[], name='lambda1')
        weight_ph = tf.placeholder(tf.float32, [None], name="important_weight")
        # build q-function input
        if q_index < FLAGS.num_adversaries:  # predator
            q_input = tf.concat(
                obs_ph_n[:FLAGS.num_adversaries] +
                act_ph_n[:FLAGS.num_adversaries], 1)
            train_obs_input = obs_ph_n[:FLAGS.num_adversaries]
            train_action_input = act_ph_n[:FLAGS.num_adversaries]
        else:
            q_input = tf.concat(
                obs_ph_n[FLAGS.num_adversaries:] +
                act_ph_n[FLAGS.num_adversaries:], 1)
            train_obs_input = obs_ph_n[FLAGS.num_adversaries:]
            train_action_input = act_ph_n[FLAGS.num_adversaries:]

        q_num_units = FLAGS.num_units_ma  # number of hidden units for MADDPG
        if local_q_func:
            q_input = tf.concat([obs_ph_n[q_index], act_ph_n[q_index]], 1)
            q_num_units = FLAGS.num_units  # number of hidden units for DDPG

        q = q_func(q_input,
                   1,
                   scope="q_func",
                   num_units=q_num_units,
                   layer_norm=layer_norm)[:, 0]
        q_func_vars = U.scope_vars(U.absolute_scope_name("q_func"))
        reg_loss = tf.contrib.layers.apply_regularization(
            tf.contrib.layers.l2_regularizer(FLAGS.lambda2), q_func_vars)

        # TODO: apply importance weights when using a prioritized replay buffer
        td_0 = target_ph - q
        q_loss_td_0 = -tf.reduce_mean(weight_ph * tf.stop_gradient(td_0) * q)
        q_td_0_loss = tf.reduce_mean(weight_ph * tf.square(td_0))

        # TODO: clip the positive difference (R - Q) > 0 here
        # mask = tf.where(return_ph - tf.squeeze(q) > 0.0,
        #                 tf.ones_like(return_ph), tf.zeros_like(return_ph))
        # TODO: add dis_2_end: return_confidence_factor
        confidence = tf.pow(FLAGS.return_confidence_factor, dis_2_end_ph)
        # td_n = (return_ph * confidence - q) * mask
        # TODO: add clip here...
        # td_n = tf.clip_by_value(return_ph * confidence - q, 0., 4.) * mask
        td_n = tf.clip_by_value(return_ph * confidence - q, 0., 4.)
        q_loss_monte_carlo = -tf.reduce_mean(
            weight_ph * tf.stop_gradient(td_n) * q)
        # q_td_n_loss = tf.reduce_mean(weight_ph * tf.square((return_ph * confidence - q) * mask))
        q_td_n_loss = tf.reduce_mean(weight_ph * tf.square(td_n))

        loss = q_loss_td_0 + lambda1_ph * q_loss_monte_carlo + reg_loss
        # loss = q_td_0_loss + lambda1_ph * q_td_n_loss + lambda2_ph * margin_classification_loss + reg_loss

        q_values = U.function(train_obs_input + train_action_input, q)

        # target network
        target_q = q_func(q_input,
                          1,
                          scope="target_q_func",
                          num_units=q_num_units,
                          layer_norm=layer_norm)[:, 0]
        target_q_func_vars = U.scope_vars(
            U.absolute_scope_name("target_q_func"))
        update_target_q = make_update_exp(q_func_vars, target_q_func_vars)

        target_q_values = U.function(train_obs_input + train_action_input,
                                     target_q)

        # build optimizer
        optimize_expr = U.minimize_and_clip(optimizer, loss, q_func_vars,
                                            grad_norm_clipping)

        # Create callable functions
        train = U.function(
            inputs=train_obs_input + train_action_input + [target_ph] + [
                weight_ph,
                lambda1_ph,
                dis_2_end_ph,
                return_ph,
            ],
            outputs=[],
            # outputs=[loss, q_loss_td_0, q_loss_monte_carlo, margin_classification_loss, reg_loss,
            #          q_td_0_loss, q_td_n_loss],
            updates=[optimize_expr])

        return train, update_target_q, {
            'q_values': q_values,
            'target_q_values': target_q_values
        }
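
A note on the loss construction above: `q_loss_td_0 = -mean(w * stop_gradient(td_0) * q)` is a surrogate whose gradient with respect to `q` matches that of the weighted squared TD error `0.5 * mean(w * td_0**2)` (i.e., half the gradient of `q_td_0_loss`), and the same trick is applied to the clipped Monte-Carlo term `td_n`. A small self-contained numpy check of that equivalence:

import numpy as np

rng = np.random.default_rng(0)
q = rng.normal(size=32)             # critic outputs
target = rng.normal(size=32)        # TD targets (constants w.r.t. q)
w = rng.uniform(0.5, 1.5, size=32)  # importance weights
td = target - q

# gradient of -mean(w * td_const * q) w.r.t. q, with td treated as a constant
grad_surrogate = -w * td / len(q)
# gradient of 0.5 * mean(w * (target - q)**2) w.r.t. q
grad_half_mse = -w * (target - q) / len(q)
assert np.allclose(grad_surrogate, grad_half_mse)
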
Example 3
def discriminator_train(obs_shape_n, act_space_n, agent_index, discriminator_func, optimizer, grad_norm_clipping=None,
                        scope="trainer", reuse=None):
    with tf.variable_scope(scope, reuse=reuse):
        expert_dis_2_end_ph = tf.placeholder(tf.float32, [None], name="expert_dis_2_end")
        policy_dis_2_end_ph = tf.placeholder(tf.float32, [None], name="policy_dis_2_end")
        state_action_confidence_factor_ph = tf.placeholder(tf.float32, shape=[], name='state_action_confidence_factor')

        # create act distributions
        act_pdtype = make_pdtype(act_space_n[agent_index])
        # set up placeholders
        expert_act_ph = act_pdtype.sample_placeholder([None], name="expert_action" + str(agent_index))
        expert_state_ph = U.BatchInput(obs_shape_n[agent_index], name="expert_observation" + str(agent_index)).get()

        policy_act_ph = act_pdtype.sample_placeholder([None], name="policy_action" + str(agent_index))
        policy_state_ph = U.BatchInput(obs_shape_n[agent_index], name="policy_observation" + str(agent_index)).get()

        # input for discriminator
        expert_input = tf.concat([expert_state_ph, expert_act_ph], 1)
        d_model_real, d_logits_real = discriminator_func(expert_input, scope="discriminator",
                                                         num_units=FLAGS.num_units)
        policy_input = tf.concat([policy_state_ph, policy_act_ph], 1)
        d_model_fake, d_logits_fake = discriminator_func(policy_input, scope="discriminator", reuse=True,
                                                         num_units=FLAGS.num_units)

        discriminator_func_vars = U.scope_vars(U.absolute_scope_name("discriminator"))

        # Calculate losses
        # To help the discriminator generalize better, the positive labels are reduced slightly
        # from 1.0 to 0.9 via the `smooth` parameter. This is known as label smoothing and is
        # commonly used with classifiers to improve performance.
        smooth = 0.1
        if FLAGS.consider_state_action_confidence:
            print('consider state action confidence...')
            expert_confidence = tf.pow(state_action_confidence_factor_ph, expert_dis_2_end_ph)
            expert_confidence_sum = tf.reduce_sum(expert_confidence)
            d_loss_real = tf.reduce_sum(
                tf.nn.sigmoid_cross_entropy_with_logits(logits=d_logits_real,
                                                        labels=tf.ones_like(d_logits_real) * (
                                                                1 - smooth)) * expert_confidence) / expert_confidence_sum

            policy_confidence = tf.pow(state_action_confidence_factor_ph, policy_dis_2_end_ph)
            policy_confidence_sum = tf.reduce_sum(policy_confidence)
            d_loss_fake = tf.reduce_sum(
                tf.nn.sigmoid_cross_entropy_with_logits(logits=d_logits_fake,
                                                        labels=tf.zeros_like(
                                                            d_logits_fake)) * policy_confidence) / policy_confidence_sum
        else:
            print("doesn't consider state action confidence...")
            d_loss_real = tf.reduce_mean(
                tf.nn.sigmoid_cross_entropy_with_logits(logits=d_logits_real,
                                                        labels=tf.ones_like(d_logits_real) * (1 - smooth)))
            d_loss_fake = tf.reduce_mean(
                tf.nn.sigmoid_cross_entropy_with_logits(logits=d_logits_fake,
                                                        labels=tf.zeros_like(d_logits_fake)))
        d_loss = d_loss_real + d_loss_fake

        # build optimizer
        update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS, scope=tf.get_variable_scope().name)
        update_ops_q = [item for item in update_ops if item.name.find('discriminator') != -1]
        print('discriminator-func, batch norm update parameters: ', update_ops_q)
        print("all update options: ", tf.get_collection(tf.GraphKeys.UPDATE_OPS))
        with tf.control_dependencies(update_ops_q):
            optimize_expr = U.minimize_and_clip(optimizer, d_loss, discriminator_func_vars, grad_norm_clipping)

        # Create callable functions
        train = U.function(
            inputs=[expert_state_ph, expert_act_ph, policy_state_ph, policy_act_ph, state_action_confidence_factor_ph,
                    expert_dis_2_end_ph, policy_dis_2_end_ph],
            outputs=[d_loss, d_loss_fake],
            updates=[optimize_expr])

        # d_model_fake_values = U.function([policy_state_ph, policy_act_ph], outputs=d_model_fake)
        # -np.log(0.99) ≈ 0.01, -np.log(0.01) ≈ 4.61
        d_model_fake_clipped = tf.clip_by_value(d_model_fake, 0.01, 0.99)
        # d_model_fake_clipped = tf.clip_by_value(d_model_fake, 0.1, 0.9)
        # TODO: clip reward to [-0.5, 1.5]
        imitation_reward = tf.clip_by_value(-tf.log(1. - d_model_fake_clipped)[:, 0] + np.log(0.5), -0.5, 1.5)
        # if args.subtract_baseline:
        #     imitation_reward = -tf.log(1. - d_model_fake_clipped)[:, 0] + np.log(0.5)
        # else:
        #     imitation_reward = -tf.log(1. - d_model_fake_clipped)[:, 0]
        imitation_reward_values = U.function([policy_state_ph, policy_act_ph], outputs=imitation_reward)
        return train, imitation_reward_values
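
A standalone numpy sketch of the imitation-reward mapping built at the end of the example (the function name is illustrative): the discriminator output is clipped to [0.01, 0.99], shifted by the log(0.5) baseline so that D = 0.5 yields zero reward, and the result is clipped to [-0.5, 1.5].

import numpy as np

def imitation_reward(d_model_fake):
    # mirrors: clip(-log(1 - clip(D, 0.01, 0.99)) + log(0.5), -0.5, 1.5)
    d = np.clip(d_model_fake, 0.01, 0.99)
    return np.clip(-np.log(1.0 - d) + np.log(0.5), -0.5, 1.5)

print(imitation_reward(np.array([0.01, 0.5, 0.9, 0.99])))
# -> approximately [-0.5, 0.0, 1.5, 1.5]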