Beispiel #1
0
def q_train(make_obs_ph_n,
            act_space_n,
            q_index,
            q_func,
            lstm_model,
            optimizer,
            args,
            grad_norm_clipping=None,
            local_q_func=False,
            scope="trainer",
            reuse=None,
            num_units=64,
            use_lstm=True):
    """Build the critic (Q) graph and its target copy inside ``scope``.

    Args:
        make_obs_ph_n: Used directly as the list of observation placeholders
            (it is assigned, not called, despite the name).
        act_space_n: Action spaces, one per agent; each is passed to
            ``make_pdtype``.
        q_index: Index of the agent whose observation/action feed the critic
            when ``local_q_func`` is true.
        q_func: Network builder, called as
            ``q_func(input, 1, scope=..., num_units=...)``; column 0 of its
            output is used as the Q value.
        lstm_model: Called as ``lstm_model(obs, scope="lstm", reuse=reuse)``
            when ``use_lstm`` is true.
        optimizer: Forwarded to ``U.minimize_and_clip``.
        args: Not used by this function.
        grad_norm_clipping: Forwarded to ``U.minimize_and_clip``.
        local_q_func: If true, the critic sees only agent ``q_index``;
            otherwise it sees every agent's observation and action.
        scope: Name of the enclosing variable scope.
        reuse: Forwarded to ``tf.variable_scope`` and to ``lstm_model``.
        num_units: Forwarded to ``q_func``.
        use_lstm: Selects ``lstm_model`` versus ``tf.squeeze(o, 2)`` for the
            observation reduction, and adds the "lstm" variables to the
            optimized set.

    Returns:
        ``(train, update_target_q, debug)`` where ``train`` is the
        ``U.function`` built with ``outputs=loss`` and
        ``updates=[optimize_expr]``, ``update_target_q`` is the result of
        ``make_update_exp(q_func_vars, target_q_func_vars)``, and ``debug``
        maps ``'q_values'`` / ``'target_q_values'`` to ``U.function`` objects.
    """
    with tf.variable_scope(scope, reuse=reuse):
        # =================== online Q network ===================
        # Scalar placeholder that is appended to every feed list below; it is
        # not consumed anywhere else in this function.
        bs_ph = tf.placeholder(tf.int32, shape=[], name="bs")
        # One action distribution type per agent.
        pd_types = [make_pdtype(space) for space in act_space_n]
        # Observation placeholders are supplied by the caller.
        obs_inputs = make_obs_ph_n

        # Dimension reduction of the raw observations.
        if use_lstm:
            features = lstm_model(obs_inputs, scope="lstm", reuse=reuse)
        else:
            features = [tf.squeeze(ph, 2) for ph in obs_inputs]

        act_inputs = [
            pd_type.sample_placeholder([None], name="action" + str(idx))
            for idx, pd_type in enumerate(pd_types)
        ]
        # Computed at run time and fed in; it only enters the loss.
        td_target = tf.placeholder(tf.float32, [None], name="target")

        # Critic input: one agent's (obs, action) or everyone's.
        if local_q_func:
            critic_parts = [features[q_index], act_inputs[q_index]]
        else:
            critic_parts = features + act_inputs
        critic_in = tf.concat(critic_parts, 1)

        q_out = q_func(critic_in, 1, scope="q_func", num_units=num_units)
        q = q_out[:, 0]

        critic_vars = U.scope_vars(U.absolute_scope_name("q_func"))
        trained_vars = critic_vars
        if use_lstm:
            # The LSTM parameters are optimized together with the critic.
            trained_vars = critic_vars + U.scope_vars(
                U.absolute_scope_name("lstm"))

        # Mean-squared error against the fed target.
        loss = tf.reduce_mean(tf.square(q - td_target))
        # Regularizer term is still built but currently not added to the loss.
        unused_q_reg = tf.reduce_mean(tf.square(q))

        optimize_expr = U.minimize_and_clip(optimizer, loss, trained_vars,
                                            grad_norm_clipping)
        # =================== end of online Q network ===================

        def feed_list(*extra):
            # Fresh list per call: observations, actions, extras, batch size.
            return obs_inputs + act_inputs + list(extra) + [bs_ph]

        train = U.function(inputs=feed_list(td_target),
                           outputs=loss,
                           updates=[optimize_expr])
        q_values = U.function(feed_list(), q)

        # =================== target Q network ===================
        target_out = q_func(critic_in,
                            1,
                            scope="target_q_func",
                            num_units=num_units)
        target_q = target_out[:, 0]
        target_vars = U.scope_vars(U.absolute_scope_name("target_q_func"))
        # =================== end of target Q network ===================

        update_target_q = make_update_exp(critic_vars, target_vars)
        target_q_values = U.function(feed_list(), target_q)

        debug_fns = {
            'q_values': q_values,
            'target_q_values': target_q_values
        }
        return train, update_target_q, debug_fns
Beispiel #2
0
def p_train(make_obs_ph_n,
            act_space_n,
            p_scope,
            p_index,
            p_func,
            q_func,
            lstm_model,
            optimizer,
            args,
            grad_norm_clipping=None,
            local_q_func=False,
            num_units=64,
            scope="trainer",
            reuse=None,
            use_lstm=True):
    """Build the policy-training graph for agent ``p_index``.

    The policy is built inside ``scope``, the critic that scores it is built
    inside ``p_scope``, and the loss/optimizer ops live in
    ``scope + "_" + p_scope``.

    Args:
        make_obs_ph_n: Used directly as the list of observation placeholders
            (it is assigned, not called, despite the name).
        act_space_n: Action spaces, one per agent; each is passed to
            ``make_pdtype``.
        p_scope: Variable scope in which ``q_func`` is invoked.
        p_index: Index of the agent whose policy is trained.
        p_func: Policy network builder.
        q_func: Critic network builder; column 0 of its output is the Q value.
        lstm_model: Called as ``lstm_model(obs, reuse=reuse, scope="lstm")``
            when ``use_lstm`` is true.
        optimizer: Forwarded to ``U.minimize_and_clip``.
        args: Not used by this function.
        grad_norm_clipping: Forwarded to ``U.minimize_and_clip``.
        local_q_func: If true, the critic input is only this agent's reduced
            observation and action; otherwise every agent's.
        num_units: Forwarded to ``p_func`` and ``q_func``.
        scope: Variable scope of the policy.
        reuse: Forwarded to the variable scopes, ``lstm_model``, ``p_func``
            and ``q_func``.
        use_lstm: Selects ``lstm_model`` versus ``tf.squeeze(o, 2)`` for the
            observation reduction, and adds the "lstm" variables to the
            optimized set.

    Returns:
        The ``U.function`` built with ``outputs=loss`` and
        ``updates=[optimize_expr]``; its inputs are the observation
        placeholders, the action placeholders and the batch-size placeholder.
    """
    with tf.variable_scope(scope, reuse=reuse):
        # Scalar placeholder that is part of the feed list; it is not consumed
        # anywhere else in this function.
        batch_size = tf.placeholder(tf.int32, shape=[], name="bs")
        # One action distribution type per agent, used for sampling.
        act_pdtype_n = [make_pdtype(act_space) for act_space in act_space_n]
        # Observation placeholders are supplied by the caller.
        obs_ph_n = make_obs_ph_n
        act_ph_n = [
            pdtype.sample_placeholder([None], name="action" + str(i))
            for i, pdtype in enumerate(act_pdtype_n)
        ]
        # Reduced observations of every agent.
        if use_lstm:
            observation_n = lstm_model(obs_ph_n, reuse=reuse, scope="lstm")
        else:
            observation_n = [tf.squeeze(o, 2) for o in obs_ph_n]
        # Local (reduced) observation of the agent being trained.
        p_input = observation_n[p_index]

        # Flat distribution parameters from which the action is sampled.
        p = p_func(p_input,
                   int(act_pdtype_n[p_index].param_shape()[0]),
                   scope="p_func",
                   reuse=reuse,
                   num_units=num_units)
        p_func_vars = U.scope_vars(U.absolute_scope_name("p_func"))
        train_vars = p_func_vars
        if use_lstm:
            # The LSTM parameters are optimized together with the policy.
            train_vars = p_func_vars + U.scope_vars(
                U.absolute_scope_name("lstm"))
        # Wrap the flat parameters in a distribution object.
        act_pd = act_pdtype_n[p_index].pdfromflat(p)

        p_reg = tf.reduce_mean(tf.square(act_pd.flatparam()))

        # Replace this agent's fed action by a sample from its own policy so
        # that the critic value depends on the policy parameters.
        act_input_n = list(act_ph_n)
        act_input_n[p_index] = act_pd.sample()

        if local_q_func:
            # Use the reduced observation, not the raw placeholder: the raw
            # placeholder still carries the axis removed by squeeze/LSTM, so
            # it cannot be concatenated with the action on axis 1, and
            # q_train builds its local critic on the reduced observation too.
            q_input = tf.concat(
                [observation_n[p_index], act_input_n[p_index]], 1)
        else:
            # Observations and actions of all agents.
            q_input = tf.concat(observation_n + act_input_n, 1)

    with tf.variable_scope(p_scope, reuse=reuse):
        # Q(s, a) from the critic that lives in ``p_scope``.
        q = q_func(q_input,
                   1,
                   scope="q_func",
                   reuse=reuse,
                   num_units=num_units)[:, 0]

    with tf.variable_scope(scope + "_" + p_scope, reuse=False):
        # Policy loss: maximize the critic's value of the sampled action.
        pg_loss = -tf.reduce_mean(q)
        # The loss differs per critic, so these ops get their own scope and
        # are always created fresh (reuse=False).
        loss = pg_loss + p_reg * 1e-3
        optimize_expr = U.minimize_and_clip(optimizer, loss, train_vars,
                                            grad_norm_clipping)
        # ============ end of policy network graph ============

        # Callable used to feed data; inputs must be a list.
        train = U.function(inputs=obs_ph_n + act_ph_n + [batch_size],
                           outputs=loss,
                           updates=[optimize_expr])

        return train
Beispiel #3
0
def p_train(make_obs_ph_n,
            act_space_n,
            p_scope,
            p_index,
            p_func,
            q_func,
            lstm_model,
            optimizer,
            grad_norm_clipping=None,
            local_q_func=False,
            num_units=64,
            scope="trainer",
            reuse=None,
            use_lstm=False):
    """Build policy, target policy and policy-training graph for one agent.

    Placeholders and the observation reduction are created in ``scope``; the
    policy and target policy are created in ``p_scope`` with
    ``reuse=tf.AUTO_REUSE``; the critic is then re-entered in ``scope`` with
    ``reuse=True``.

    Args:
        make_obs_ph_n: Used directly as the list of observation placeholders
            (it is assigned, not called, despite the name).
        act_space_n: Action spaces, one per agent; each is passed to
            ``make_pdtype``.
        p_scope: Variable scope holding "p_func" and "target_p_func".
        p_index: Index of the agent whose policy is built.
        p_func: Policy network builder.
        q_func: Critic network builder; column 0 of its output is the Q value.
        lstm_model: Called as ``lstm_model(obs, reuse=reuse, scope="lstm")``
            when ``use_lstm`` is true.
        optimizer: Forwarded to ``U.minimize_and_clip``.
        grad_norm_clipping: Forwarded to ``U.minimize_and_clip``.
        local_q_func: If true, the critic input is only this agent's reduced
            observation and action; otherwise every agent's.
        num_units: Forwarded to ``p_func`` and ``q_func``.
        scope: Variable scope of the placeholders, critic and optimizer ops.
        reuse: Forwarded to ``tf.variable_scope(scope, ...)`` and
            ``lstm_model``.
        use_lstm: Selects ``lstm_model`` versus ``tf.squeeze(o, 2)`` for the
            observation reduction.

    Returns:
        ``(act, train, update_target_p, debug)`` where ``act`` samples an
        action from the policy, ``train`` is the ``U.function`` built with
        ``outputs=loss`` and ``updates=[optimize_expr]``, ``update_target_p``
        is the result of ``make_update_exp(p_func_vars, target_p_func_vars)``
        and ``debug`` maps ``'p_values'`` / ``'target_act'`` to ``U.function``
        objects.
    """
    with tf.variable_scope(scope, reuse=reuse):
        # Scalar placeholder that is part of every feed list; it is not
        # consumed anywhere else in this function.
        batch_size = tf.placeholder(tf.int32, shape=[], name="bs")
        # One action distribution type per agent.
        act_pdtype_n = [make_pdtype(act_space) for act_space in act_space_n]
        act_ph_n = [
            pdtype.sample_placeholder([None], name="action" + str(i))
            for i, pdtype in enumerate(act_pdtype_n)
        ]
        # Observation placeholders are supplied by the caller.
        obs_ph_n = make_obs_ph_n

        if use_lstm:
            observation_n = lstm_model(obs_ph_n, reuse=reuse, scope="lstm")
        else:
            observation_n = [tf.squeeze(o, 2) for o in obs_ph_n]
        p_input = observation_n[p_index]

    with tf.variable_scope(p_scope, reuse=tf.AUTO_REUSE):
        p = p_func(p_input,
                   int(act_pdtype_n[p_index].param_shape()[0]),
                   scope="p_func",
                   num_units=num_units)
        p_func_vars = U.scope_vars(U.absolute_scope_name("p_func"))
        # Wrap the flat parameters in a distribution object.
        act_pd = act_pdtype_n[p_index].pdfromflat(p)
        act_sample = act_pd.sample()
        p_reg = tf.reduce_mean(tf.square(act_pd.flatparam()))
        # Replace this agent's fed action by a (separate) sample from its own
        # policy so that the critic value depends on the policy parameters.
        act_input_n = list(act_ph_n)
        act_input_n[p_index] = act_pd.sample()

        # Target policy network.
        target_p = p_func(p_input,
                          int(act_pdtype_n[p_index].param_shape()[0]),
                          scope="target_p_func",
                          num_units=num_units)
        target_p_func_vars = U.scope_vars(
            U.absolute_scope_name("target_p_func"))
        update_target_p = make_update_exp(p_func_vars, target_p_func_vars)
        target_act_sample = act_pdtype_n[p_index].pdfromflat(target_p).sample()

    with tf.variable_scope(scope, reuse=reuse):
        if local_q_func:
            # Use the reduced observation, not the raw placeholder: the raw
            # placeholder still carries the axis removed by squeeze/LSTM, so
            # it cannot be concatenated with the action on axis 1, and
            # q_train builds its local critic on the reduced observation too.
            q_input = tf.concat(
                [observation_n[p_index], act_input_n[p_index]], 1)
        else:
            # Observations and actions of all agents.
            q_input = tf.concat(observation_n + act_input_n, 1)
        # reuse=True: the critic variables must already exist in this scope.
        q = q_func(q_input, 1, scope="q_func", reuse=True,
                   num_units=num_units)[:, 0]

        # Policy loss: maximize the critic's value of the sampled action.
        pg_loss = -tf.reduce_mean(q)
        loss = pg_loss + p_reg * 1e-3

        # NOTE(review): only the "p_func" variables are optimized here, even
        # when use_lstm is True -- confirm the "lstm" variables are meant to
        # be excluded from the policy update.
        optimize_expr = U.minimize_and_clip(optimizer, loss, p_func_vars,
                                            grad_norm_clipping)

        # Callable functions.
        train = U.function(inputs=obs_ph_n + act_ph_n + [batch_size],
                           outputs=loss,
                           updates=[optimize_expr])
        act = U.function(inputs=[obs_ph_n[p_index]] + [batch_size],
                         outputs=act_sample)
        p_values = U.function([obs_ph_n[p_index]] + [batch_size], p)
        target_act = U.function(inputs=[obs_ph_n[p_index]] + [batch_size],
                                outputs=target_act_sample)

        return act, train, update_target_p, {
            'p_values': p_values,
            'target_act': target_act
        }
Beispiel #4
0
def p_act(make_obs_ph_n,
          act_space_n,
          p_index,
          p_func,
          lstm_model,
          args,
          num_units=64,
          scope="trainer",
          reuse=None,
          use_lstm=True):
    """Build the acting graph (policy and target policy) for one agent.

    Args:
        make_obs_ph_n: Used directly as the list of observation placeholders
            (it is assigned, not called, despite the name).
        act_space_n: Action spaces, one per agent; each is passed to
            ``make_pdtype``.
        p_index: Index of the agent whose policy is built.
        p_func: Policy network builder, invoked once with scope "p_func" and
            once with scope "target_p_func".
        lstm_model: Called as ``lstm_model(obs, reuse=reuse, scope="lstm")``
            when ``use_lstm`` is true.
        args: Not used by this function.
        num_units: Forwarded to ``p_func``.
        scope: Name of the enclosing variable scope.
        reuse: Forwarded to ``tf.variable_scope``, ``lstm_model`` and
            ``p_func``.
        use_lstm: Selects ``lstm_model`` versus ``tf.squeeze(o, 2)`` for the
            observation reduction.

    Returns:
        ``(act, update_target_p, debug)`` where ``act`` samples an action
        from the policy, ``update_target_p`` is the result of
        ``make_update_exp(p_func_vars, target_p_func_vars)`` and ``debug``
        maps ``'p_values'`` / ``'target_act'`` to ``U.function`` objects.
    """
    with tf.variable_scope(scope, reuse=reuse):
        # ============ policy network ============
        # Scalar placeholder that is part of every feed list; it is not
        # consumed anywhere else in this function.
        bs_ph = tf.placeholder(tf.int32, shape=[], name="bs")
        # One action distribution type per agent, used for sampling.
        pd_types = [make_pdtype(space) for space in act_space_n]
        # Observation placeholders are supplied by the caller.
        obs_inputs = make_obs_ph_n

        # Reduced observations of every agent.
        if use_lstm:
            features = lstm_model(obs_inputs, reuse=reuse, scope="lstm")
        else:
            features = [tf.squeeze(ph, 2) for ph in obs_inputs]
        # Local (reduced) observation of this agent.
        own_features = features[p_index]
        own_pd_type = pd_types[p_index]

        # Flat distribution parameters from which the action is sampled.
        p = p_func(own_features,
                   int(own_pd_type.param_shape()[0]),
                   scope="p_func",
                   num_units=num_units,
                   reuse=reuse)
        policy_vars = U.scope_vars(U.absolute_scope_name("p_func"))
        sampled_action = own_pd_type.pdfromflat(p).sample()
        # ============ end of policy network ============

        # Callable that samples an action.
        act = U.function(inputs=[obs_inputs[p_index], bs_ph],
                         outputs=sampled_action)
        # Callable that returns the raw policy output.
        p_values = U.function([obs_inputs[p_index], bs_ph], p)

        # ============ target policy network ============
        target_p = p_func(own_features,
                          int(own_pd_type.param_shape()[0]),
                          scope="target_p_func",
                          reuse=reuse,
                          num_units=num_units)
        target_policy_vars = U.scope_vars(
            U.absolute_scope_name("target_p_func"))
        # Action sampled from the target policy.
        target_sampled_action = own_pd_type.pdfromflat(target_p).sample()
        # ============ end of target policy network ============

        # Callable that samples a target action.
        target_act = U.function(inputs=[obs_inputs[p_index], bs_ph],
                                outputs=target_sampled_action)

        # Expression that updates the target network parameters.
        update_target_p = make_update_exp(policy_vars, target_policy_vars)

        debug_fns = {
            'p_values': p_values,
            'target_act': target_act
        }
        return act, update_target_p, debug_fns
Beispiel #5
0
def q_train(make_obs_ph_n,
            act_space_n,
            q_index,
            q_func,
            lstm_model,
            optimizer,
            grad_norm_clipping=None,
            local_q_func=False,
            scope="trainer",
            reuse=None,
            num_units=64,
            use_lstm=False):
    """Build the critic (Q) graph and its target copy inside ``scope``.

    Args:
        make_obs_ph_n: Used directly as the list of observation placeholders
            (it is assigned, not called, despite the name).
        act_space_n: Action spaces, one per agent; each is passed to
            ``make_pdtype``.
        q_index: Index of the agent whose observation/action feed the critic
            when ``local_q_func`` is true.
        q_func: Network builder, called as
            ``q_func(input, 1, scope=..., num_units=...)``; column 0 of its
            output is used as the Q value.
        lstm_model: Called as ``lstm_model(obs, scope="lstm", reuse=reuse)``
            when ``use_lstm`` is true.
        optimizer: Forwarded to ``U.minimize_and_clip``.
        grad_norm_clipping: Forwarded to ``U.minimize_and_clip``.
        local_q_func: If true, the critic sees only agent ``q_index``;
            otherwise it sees every agent's observation and action.
        scope: Name of the enclosing variable scope.
        reuse: Forwarded to ``tf.variable_scope`` and to ``lstm_model``.
        num_units: Forwarded to ``q_func``.
        use_lstm: Selects ``lstm_model`` versus ``tf.squeeze(o, 2)`` for the
            observation reduction, and adds the "lstm" variables to the
            optimized set.

    Returns:
        ``(train, update_target_q, debug)`` where ``train`` is the
        ``U.function`` built with ``outputs=loss`` and
        ``updates=[optimize_expr]``, ``update_target_q`` is the result of
        ``make_update_exp(q_func_vars, target_q_func_vars)``, and ``debug``
        maps ``'q_values'`` / ``'target_q_values'`` to ``U.function`` objects.
    """
    with tf.variable_scope(scope, reuse=reuse):
        # --- placeholders ---
        # Scalar batch-size placeholder; it is appended to every feed list
        # and not consumed anywhere else in this function.
        n_batch = tf.placeholder(tf.int32, shape=[], name="bs")
        # Action distributions and the matching action placeholders.
        dist_types = [make_pdtype(space) for space in act_space_n]
        action_phs = [
            dist.sample_placeholder([None], name="action" + str(k))
            for k, dist in enumerate(dist_types)
        ]
        # Observation placeholders are supplied by the caller.
        obs_phs = make_obs_ph_n
        # Target Q value, fed at run time.
        y_ph = tf.placeholder(tf.float32, [None], name="target")

        # --- online critic ---
        reduced_obs = (lstm_model(obs_phs, scope="lstm", reuse=reuse)
                       if use_lstm else
                       [tf.squeeze(ph, 2) for ph in obs_phs])
        joint = ([reduced_obs[q_index], action_phs[q_index]]
                 if local_q_func else
                 reduced_obs + action_phs)
        q_input = tf.concat(joint, 1)
        q = q_func(q_input, 1, scope="q_func", num_units=num_units)[:, 0]

        online_vars = U.scope_vars(U.absolute_scope_name("q_func"))
        if use_lstm:
            # The LSTM parameters are optimized together with the critic.
            optimized_vars = online_vars + U.scope_vars(
                U.absolute_scope_name("lstm"))
        else:
            optimized_vars = online_vars

        # Mean-squared error against the fed target.
        squared_error = tf.square(q - y_ph)
        loss = tf.reduce_mean(squared_error)
        # Regularizer term is still built but currently not added to the loss.
        unused_q_reg = tf.reduce_mean(tf.square(q))

        optimize_expr = U.minimize_and_clip(optimizer, loss, optimized_vars,
                                            grad_norm_clipping)

        # --- callable functions ---
        train = U.function(inputs=obs_phs + action_phs + [y_ph, n_batch],
                           outputs=loss,
                           updates=[optimize_expr])
        q_values = U.function(obs_phs + action_phs + [n_batch], q)

        # --- target critic ---
        target_q = q_func(q_input,
                          1,
                          scope="target_q_func",
                          num_units=num_units)[:, 0]
        frozen_vars = U.scope_vars(U.absolute_scope_name("target_q_func"))

        update_target_q = make_update_exp(online_vars, frozen_vars)
        target_q_values = U.function(obs_phs + action_phs + [n_batch],
                                     target_q)

        return train, update_target_q, {
            'q_values': q_values,
            'target_q_values': target_q_values
        }