Beispiel #1
0
def p_approx_train(make_obs_ph_n, act_space_n, p_index, p_func, q_func, optimizer, grad_norm_clipping=None,
                    local_q_func=False, num_units=64, scope="trainer", reuse=None):
    with tf.variable_scope(scope, reuse=reuse):
        # create distribtuions
        act_pdtype_n = [make_pdtype(act_space) for act_space in act_space_n]

        # set up placeholders
        obs_ph_n = make_obs_ph_n  # [U.ensure_tf_input(make_obs_ph_n[i]("observation"+str(i))).get() for i in range(len(make_obs_ph_n))]
        act_ph_n = [act_pdtype_n[i].sample_placeholder([None], name="action" + str(i)) for i in range(len(act_space_n))]
        act_logits_ph_n = [act_pdtype_n[i].sample_placeholder([None], name="action_mode" + str(i)) for i in range(len(act_space_n))]

        p_input = obs_ph_n[p_index]

        p = p_func(p_input, int(act_pdtype_n[p_index].param_shape()[0]), scope="p_func")
        p_func_vars = U.scope_vars(U.absolute_scope_name("p_func"))

        # wrap parameters in distribution
        act_pd = act_pdtype_n[p_index].pdfromflat(p)

        act_sample = act_pd.sample()
        #p_reg = tf.reduce_mean(tf.square(act_pd.flatparam()))
        p_reg = -tf.reduce_mean(act_pd.entropy())

        act_input_n = act_ph_n + []
        act_target = act_input_n[p_index]
        pg_loss = -tf.reduce_mean(act_pd.logp(act_target))

        loss = pg_loss + p_reg * 1e-3

        optimize_expr = U.minimize_and_clip(optimizer, loss, p_func_vars, grad_norm_clipping)

        # Create callable functions
        train = U.function(inputs=obs_ph_n + act_ph_n, outputs=loss, updates=[optimize_expr])
        act = U.function(inputs=[obs_ph_n[p_index]], outputs=act_sample)
        p_values = U.function([obs_ph_n[p_index]], p)

        # target network
        target_p = p_func(p_input, int(act_pdtype_n[p_index].param_shape()[0]), scope="target_p_func",
                          num_units=num_units)
        target_p_func_vars = U.scope_vars(U.absolute_scope_name("target_p_func"))
        update_target_p = make_update_exp(p_func_vars, target_p_func_vars)
        sync_target_p = make_update_exp(p_func_vars, target_p_func_vars, rate=1.0)

        target_pd = act_pdtype_n[p_index].pdfromflat(target_p)
        target_act_sample = target_pd.sample()
        target_act = U.function(inputs=[obs_ph_n[p_index]], outputs=target_act_sample)

        target_ph_pd = act_pdtype_n[p_index].pdfromflat(act_logits_ph_n[p_index])
        kl_loss = tf.reduce_mean(target_pd.kl(target_ph_pd))
        f_kl_loss = U.function(inputs=[obs_ph_n[p_index],act_logits_ph_n[p_index]],outputs=kl_loss)

        return act, train, update_target_p, sync_target_p, {'p_values': p_values, 'kl_loss': f_kl_loss, 'target_act': target_act}
Beispiel #2
0
def p_train(make_obs_ph_n, act_space_n, p_index, p_func, q_func, optimizer, grad_norm_clipping=None,
            local_q_func=False, num_units=64, scope="trainer", reuse=tf.AUTO_REUSE):
    with tf.variable_scope(scope, reuse=reuse):
        # create distribtuions
        act_pdtype_n = [make_pdtype(act_space) for act_space in act_space_n]

        # set up placeholders
        obs_ph_n = make_obs_ph_n
        act_ph_n = [act_pdtype_n[i].sample_placeholder([None], name="action" + str(i)) for i in range(len(act_space_n))]

        p_input = obs_ph_n[p_index]

        p = p_func(p_input, int(act_pdtype_n[p_index].param_shape()[0]), scope="p_func", num_units=num_units)
        p_func_vars = U.scope_vars(U.absolute_scope_name("p_func"))

        # wrap parameters in distribution
        act_pd = act_pdtype_n[p_index].pdfromflat(p)

        act_sample = act_pd.sample()
        p_reg = tf.reduce_mean(tf.square(act_pd.flatparam()))

        act_input_n = act_ph_n + []
        act_input_n[p_index] = act_pd.sample()
        q_input = tf.concat(obs_ph_n + act_input_n, 1)
        if local_q_func:
            q_input = tf.concat([obs_ph_n[p_index], act_input_n[p_index]], 1)
        q = q_func(q_input, 1, scope="q_func", reuse=True, num_units=num_units)[:, 0]
        pg_loss = -tf.reduce_mean(q)

        loss = pg_loss + p_reg * 1e-3

        optimize_expr = U.minimize_and_clip(optimizer, loss, p_func_vars, grad_norm_clipping)

        # Create callable functions
        train = U.function(inputs=obs_ph_n + act_ph_n, outputs=loss, updates=[optimize_expr])
        act = U.function(inputs=[obs_ph_n[p_index]], outputs=act_sample)

        act_test = U.function(inputs=[obs_ph_n[p_index]], outputs=p)

        p_values = U.function([obs_ph_n[p_index]], p)

        # target network
        target_p = p_func(p_input, int(act_pdtype_n[p_index].param_shape()[0]), scope="target_p_func",
                          num_units=num_units)
        target_p_func_vars = U.scope_vars(U.absolute_scope_name("target_p_func"))
        update_target_p = make_update_exp(p_func_vars, target_p_func_vars)

        target_act_sample = act_pdtype_n[p_index].pdfromflat(target_p).sample()
        target_act = U.function(inputs=[obs_ph_n[p_index]], outputs=target_act_sample)

        return act_test, act, train, update_target_p, {'p_values': p_values, 'target_act': target_act,
                                                       'p_vars': p_func_vars, 'target_p_vars': target_p_func_vars}
Beispiel #3
0
def q_train(n_agents, make_state_ph_n, make_obs_ph_n, act_space_n, q_index, q_func, optimizer, grad_norm_clipping=None, local_q_func=False,
            scope="trainer", reuse=None, num_units=64, discrete_action=False, target_update_tau=0.001, use_global_state=False,
            share_weights=False):

    with tf.variable_scope(scope, reuse=reuse):
        # create distribtuions
        act_pdtype_n = [make_pdtype(act_space) for act_space in act_space_n]

        # set up placeholders
        if not use_global_state:
            obs_ph_n = make_obs_ph_n
        else:
            obs_ph_n = make_state_ph_n
        act_ph_n = [act_pdtype_n[i].sample_placeholder([None], name="action"+str(i)) for i in range(len(act_space_n))]
        target_ph = tf.placeholder(tf.float32, [None], name="target")

        q_input = tf.concat(obs_ph_n + act_ph_n, 1)
        if local_q_func:
            q_input = tf.concat([obs_ph_n[q_index], act_ph_n[q_index]], 1)
        if share_weights:
            # add agent id to input as layers share weights
            q_input = tf.concat([q_input,
                                 tf.tile(tf.eye(n_agents)[q_index:q_index+1],
                                         [tf.shape(q_input)[0], 1])], -1)
        q = q_func(q_input, 1, scope="q_func", reuse=share_weights, num_units=num_units,
                   constrain_out=False, discrete_action=discrete_action)[:, 0] #share_weights)[:, 0]
        q_func_vars = U.scope_vars(U.absolute_scope_name("q_func"))

        q_loss = tf.reduce_mean(tf.square(q - target_ph))

        # viscosity solution to Bellman differential equation in place of an initial condition
        q_reg = tf.reduce_mean(tf.square(q))
        loss = q_loss #+ 1e-3 * q_reg

        optimize_expr = U.minimize_and_clip(optimizer, loss, q_func_vars, grad_norm_clipping)

        # Create callable functions
        train = U.function(inputs=obs_ph_n + act_ph_n + [target_ph], outputs=loss, updates=[optimize_expr])
        q_values = U.function(obs_ph_n + act_ph_n, q)

        # target network
        target_q = q_func(q_input, 1, scope="target_q_func", reuse=share_weights, num_units=num_units,
                          constrain_out=False, discrete_action=discrete_action)[:, 0]
        target_q_func_vars = U.scope_vars(U.absolute_scope_name("target_q_func"))
        update_target_q = make_update_exp(q_func_vars, target_q_func_vars, target_update_tau)

        target_q_values = U.function(obs_ph_n + act_ph_n, target_q)

        return train, update_target_q, {'q_values': q_values, 'target_q_values': target_q_values}
Beispiel #4
0
def q_train(make_obs_ph_n, act_space_n, q_index, q_func, shared_CNN, optimizer, make_obs_map_ph_n,grad_norm_clipping=None, local_q_func=False, scope="trainer", reuse=None, num_units=64):
    
        # create distribtuions
    act_pdtype_n = [make_pdtype(act_space) for act_space in act_space_n]
    # set up placeholders
    obs_ph_n = make_obs_ph_n

    obs_map_ph_n=make_obs_map_ph_n

    obs_ph_next = obs_ph_n
    act_ph_n = [act_pdtype_n[i].sample_placeholder([None], name="action"+str(i)) for i in range(len(act_space_n))]
    target_ph = tf.placeholder(tf.float32, [None], name="target")
    vf_input = tf.concat(obs_ph_next, 1)
    q_input = tf.concat(obs_ph_n + act_ph_n, 1)

    for i in range(len(obs_ph_n)):
        q_input=tf.concat([q_input,shared_CNN(obs_map_ph_n[i],i,scope="agent_"+str(i)+"/CNN")],1)
        vf_input=tf.concat([vf_input,shared_CNN(obs_map_ph_n[i],i,scope="agent_"+str(i)+"/CNN")],1)

    with tf.variable_scope(scope, reuse=None):
        if local_q_func:
            q_input = tf.concat([obs_ph_n[q_index], act_ph_n[q_index]], 1)
        q = q_func(q_input, 1, scope="q_func", num_units=num_units)[:,0]
        vf = q_func(vf_input, 1, scope="vf_func",num_units=num_units)[:,0]
        q_func_vars = U.scope_vars(U.absolute_scope_name("q_func"))
        vf_func_vars = U.scope_vars(U.absolute_scope_name("vf_func"))

        CNN_vars=U.scope_vars(U.absolute_scope_name("CNN"))

        q_loss = tf.reduce_mean(tf.square(q - target_ph))

        # viscosity solution to Bellman differential equation in place of an initial condition
        q_reg = tf.reduce_mean(tf.square(q))
        loss = q_loss #+ 1e-3 * q_reg
        optimize_expr = U.minimize_and_clip(optimizer, loss, q_func_vars+CNN_vars, grad_norm_clipping)

        # Create callable functions
        train = U.function(inputs=obs_ph_n + obs_map_ph_n+act_ph_n + [target_ph], outputs=loss, updates=[optimize_expr])
        q_values = U.function(obs_ph_n + obs_map_ph_n+act_ph_n, q)

        # target network
        target_q = q_func(q_input, 1, scope="target_q_func", num_units=num_units)[:,0]
        target_q_func_vars = U.scope_vars(U.absolute_scope_name("target_q_func"))
        update_target_q = make_update_exp(q_func_vars, target_q_func_vars)

        target_q_values = U.function(obs_ph_n + obs_map_ph_n+act_ph_n, target_q)
        target_vf_values = U.function(obs_ph_n +obs_map_ph_n, vf)

        return train, update_target_q, {'q_values': q_values, 'target_vf_values': target_vf_values}
Beispiel #5
0
def q_LSTM_train(make_obs_ph_n, act_space_n, q_index, q_func,
            optimizer, grad_norm_clipping=None, local_q_func=False,
            scope="trainer", reuse=None, num_units=64):
    with tf.variable_scope(scope, reuse=reuse):
        # create distribtuions
        act_pdtype_n = [make_pdtype(act_space) for act_space in act_space_n]

        # set up placeholders
        obs_ph_n = make_obs_ph_n
        act_ph_n = [act_pdtype_n[i].sample_placeholder([None, 1], name="action"+str(i)) for i in range(len(act_space_n))]
        target_ph = tf.placeholder(tf.float32, [None, 1], name="target")

        q_res = 1
        q_c_ph, q_h_ph = get_lstm_state_ph(name='q_', n_batches=None, num_units=num_units)
        q_c_ph_n, q_h_ph_n = [q_c_ph for i in range(len(obs_ph_n))], [q_h_ph for i in range(len(obs_ph_n))]
        # need to check this -- need safety checks
        # q_input = tf.concat(obs_ph_n + act_ph_n, -1)
        q_input = tf.concat(obs_ph_n + act_ph_n + q_c_ph_n + q_h_ph_n, -1)
        if local_q_func:
            q_input = tf.concat([obs_ph_n[q_index], act_ph_n[q_index]], -1)

        q, q_state_out = q_func(q_input, 1, scope="q_func", num_units=num_units)
        q = q[:,0]

        q_func_vars = U.scope_vars(U.absolute_scope_name("q_func"))

        q_loss = tf.reduce_mean(tf.square(q - target_ph))

        # viscosity solution to Bellman differential equation in place of an initial condition
        q_reg = tf.reduce_mean(tf.square(q))
        loss = q_loss #+ 1e-3 * q_reg

        optimize_expr = U.minimize_and_clip(optimizer, loss, q_func_vars, grad_norm_clipping)

        # Create callable functions
        q_values = U.function(inputs=obs_ph_n + act_ph_n + q_c_ph_n + q_h_ph_n, outputs=[q, q_state_out])
        train = U.function(inputs=obs_ph_n + act_ph_n + q_c_ph_n + q_h_ph_n+ [target_ph], outputs=loss, updates=[optimize_expr])

        # target network
        target_q, target_q_state_out = q_func(q_input, 1, scope="target_q_func", num_units=num_units)
        target_q = target_q[:,0]

        target_q_func_vars = U.scope_vars(U.absolute_scope_name("target_q_func"))
        update_target_q = make_update_exp(q_func_vars, target_q_func_vars)

        target_q_values = U.function(inputs=obs_ph_n + act_ph_n + q_c_ph_n + q_h_ph_n, outputs=target_q)

        return train, update_target_q, {'q_values': q_values, 'target_q_values': target_q_values}
Beispiel #6
0
def make_update_exp(vals, target_vals):
    polyak = 1.0 - 1e-2
    expression = []
    for var, var_target in zip(sorted(vals, key=lambda v: v.name), sorted(target_vals, key=lambda v: v.name)):
        expression.append(var_target.assign(polyak * var_target + (1.0-polyak) * var))
    expression = tf.group(*expression)
    return U.function([], [], updates=[expression])
Beispiel #7
0
def p_train(env, make_obs_ph_n, act_space_n, p_index, vf_func, shana, q_func, optimizer, grad_norm_clipping=None, local_q_func=False, num_units=64, scope="trainer", reuse=None):
    with tf.variable_scope(scope, reuse=reuse):
        # create distribtuions
        act_pdtype_n = [make_pdtype(act_space) for act_space in act_space_n]
        # set up placeholders
        obs_ph_n = make_obs_ph_n
        act_ph_n = [act_pdtype_n[i].sample_placeholder([None], name="action"+str(i)) for i in range(len(act_space_n))]
        policy = shana(
        env_spec=env,
        af = 15,
        of = 22,
        K=2,
        hidden_layer_sizes=(100, 100),
        qf=q_func,
        reg=0.001
        )
        actions, log_pi = policy.actions_for(observations=make_obs_ph_n[p_index],
                                                   with_log_pis=True)
        print(actions)
        print(log_pi)
        p_func_vars = policy.get_params_internal()
        q_input = tf.concat(obs_ph_n + act_input_n, 1)
        if local_q_func:
            q_input = tf.concat([obs_ph_n[p_index], act_input_n[p_index]], 1)
        q = q_func(q_input, 1, scope="q_func", reuse=True, num_units=num_units)[:,0]
        pg_loss = -tf.reduce_mean(q)
        p_reg = tf.get_collection(
            tf.GraphKeys.REGULARIZATION_LOSSES,
            scope=policy.name)
        loss = pg_loss + p_reg * 1e-3

        optimize_expr = U.minimize_and_clip(optimizer, loss, p_func_vars, grad_norm_clipping)

        # Create callable functions
        train = U.function(inputs=obs_ph_n + act_ph_n, outputs=loss, updates=[optimize_expr])
        act = U.function(inputs=[obs_ph_n[p_index]], outputs=act_sample)
        p_values = U.function([obs_ph_n[p_index]], p)

        # target network
        target_p = p_func(p_input, int(act_pdtype_n[p_index].param_shape()[0]), scope="target_p_func", num_units=num_units)
        target_p_func_vars = U.scope_vars(U.absolute_scope_name("target_p_func"))
        update_target_p = make_update_exp(p_func_vars, target_p_func_vars)

        target_act_sample = act_pdtype_n[p_index].pdfromflat(target_p).sample()
        target_act = U.function(inputs=[obs_ph_n[p_index]], outputs=target_act_sample)

        return act, train, update_target_p, {'p_values': p_values, 'target_act': target_act}
def make_update_exp(vals, target_vals):  # softupdate两个神经网络的变量
    polyak = 1.0 - 1e-2  # 0.99
    expression = []
    for var, var_target in zip(sorted(vals, key=lambda v: v.name),
                               sorted(target_vals, key=lambda v: v.name)):
        expression.append(
            var_target.assign(polyak * var_target +
                              (1.0 - polyak) * var))  # target网络和当前网络
    expression = tf.group(*expression)  # expression被会话调用,其中所有变量均会被调用生效
    return U.function([], [], updates=[expression])  # 更新target_vals网络节点的值
Beispiel #9
0
def q_train(make_obs_ph_n, act_space_n, q_index, q_func, optimizer, grad_norm_clipping=None, local_q_func=False,
            scope="trainer", reuse=None, num_units=64):
    with tf.variable_scope(scope, reuse=reuse):
        # create distribtuions
        act_pdtype_n = [make_pdtype(act_space) for act_space in act_space_n]

        # set up placeholders
        obs_ph_n = make_obs_ph_n  # [U.ensure_tf_input(make_obs_ph_n[i]("observation"+str(i))).get() for i in range(len(make_obs_ph_n))]
        act_ph_n = [act_pdtype_n[i].sample_placeholder([None], name="action" + str(i)) for i in range(len(act_space_n))]
        target_ph = tf.placeholder(tf.float32, [None], name="target")

        q_input = tf.concat(obs_ph_n + act_ph_n, 1)
        if local_q_func:
            q_input = tf.concat([obs_ph_n[q_index], act_ph_n[q_index]], 1)
        q = q_func(q_input, 1, scope="q_func", num_units=num_units)[:, 0]
        q_func_vars = U.scope_vars(U.absolute_scope_name("q_func"))

        q_loss = tf.reduce_mean(tf.square(q - target_ph))
        # q_loss = tf.reduce_mean(U.huber_loss(q - target_ph))

        # TEMP: just want to give an viscosity solution to Bellman differential equation
        # in place of an initial condition
        q_reg = tf.reduce_mean(tf.square(q))
        loss = q_loss + 1e-3 * q_reg

        optimize_expr = U.minimize_and_clip(optimizer, loss, q_func_vars, grad_norm_clipping)

        # Create callable functions
        train = U.function(inputs=obs_ph_n + act_ph_n + [target_ph], outputs=loss, updates=[optimize_expr])
        q_values = U.function(obs_ph_n + act_ph_n, q)

        # target network
        target_q = q_func(q_input, 1, scope="target_q_func", num_units=num_units)[:, 0]
        target_q_func_vars = U.scope_vars(U.absolute_scope_name("target_q_func"))
        update_target_q = make_update_exp(q_func_vars, target_q_func_vars)
        sync_target_q = make_update_exp(q_func_vars, target_q_func_vars, rate=1.0)

        target_q_values = U.function(obs_ph_n + act_ph_n, target_q)

        return train, update_target_q, sync_target_q, {'q_values': q_values, 'target_q_values': target_q_values}
Beispiel #10
0
def q_train(make_obs_ph_n, act_space_n, q_index, q_func, optimizer, grad_norm_clipping=None, local_q_func=False, scope="trainer", reuse=None, num_units=64):
    with tf.variable_scope(scope, reuse=reuse):
        # create distribtuions
        act_pdtype_n = [make_pdtype(act_space) for act_space in act_space_n]

        # set up placeholders
        obs_ph_n = make_obs_ph_n
        act_ph_n = [act_pdtype_n[i].sample_placeholder([None], name="action"+str(i)) for i in range(len(act_space_n))]
        target_ph = tf.placeholder(tf.float32, [None], name="target")

        q_input = tf.concat(obs_ph_n + act_ph_n, 1)
        if local_q_func:
            q_input = tf.concat([obs_ph_n[q_index], act_ph_n[q_index]], 1) # DDPG just owns its [local] information
        q = q_func(q_input, 1, scope="q_func", num_units=num_units)[:,0]
        q_func_vars = U.scope_vars(U.absolute_scope_name("q_func"))

        q_loss = tf.reduce_mean(tf.square(q - target_ph))

        # viscosity solution to Bellman differential equation in place of an initial condition
        q_reg = tf.reduce_mean(tf.square(q))
        loss = q_loss #+ 1e-3 * q_reg                                                                 # From chapter 4.2: inferring policies of other policies ???

        optimize_expr = U.minimize_and_clip(optimizer, loss, q_func_vars, grad_norm_clipping)         # [Important]使用 optimizer 来降低 loss, 其中变量表在 q_func_vars 中,
                                                                                                      # 保证每个变量的梯度到 grad_norm_clipping -- 梯度剪切
        # Create callable functions
        train = U.function(inputs=obs_ph_n + act_ph_n + [target_ph], outputs=loss, updates=[optimize_expr])
        q_values = U.function(obs_ph_n + act_ph_n, q)

        # target network
        target_q = q_func(q_input, 1, scope="target_q_func", num_units=num_units)[:,0]
        target_q_func_vars = U.scope_vars(U.absolute_scope_name("target_q_func"))
        update_target_q = make_update_exp(q_func_vars, target_q_func_vars)

        target_q_values = U.function(obs_ph_n + act_ph_n, target_q)

        return train, update_target_q, {'q_values': q_values, 'target_q_values': target_q_values}
Beispiel #11
0
def q_train(make_obs_ph_n, act_space_n, q_index, q_func, optimizer, grad_norm_clipping=None, local_q_func=False, scope="trainer", reuse=None, num_units=64):
    with tf.variable_scope(scope, reuse=reuse):
        # create distribtuions
        act_pdtype = make_pdtype(act_space_n[0])

        # set up placeholders
        obs_ph_n = make_obs_ph_n
        act_ph = [act_pdtype.sample_placeholder([None], name="action"+str(0))]
        target_ph = tf.placeholder(tf.float32, [None], name="target")

        q_input = tf.concat(obs_ph_n + act_ph, 1)
        if local_q_func:
            q_input = tf.concat([obs_ph_n[q_index], act_ph[0]], 1)
        q = q_func(q_input, 1, scope="q_func", num_units=num_units)[:,0]
        q_func_vars = U.scope_vars(U.absolute_scope_name("q_func"))

        q_loss = tf.reduce_mean(tf.square(q - target_ph))

        # viscosity solution to Bellman differential equation in place of an initial condition
        q_reg = tf.reduce_mean(tf.square(q))
        loss = q_loss #+ 1e-3 * q_reg

        optimize_expr = U.minimize_and_clip(optimizer, loss, q_func_vars, grad_norm_clipping)

        # Create callable functions
        train = U.function(inputs=obs_ph_n + act_ph + [target_ph], outputs=loss, updates=[optimize_expr])
        q_values = U.function(obs_ph_n + act_ph, q)

        # target network
        target_q = q_func(q_input, 1, scope="target_q_func", num_units=num_units)[:,0]
        target_q_func_vars = U.scope_vars(U.absolute_scope_name("target_q_func"))
        update_target_q = make_update_exp(q_func_vars, target_q_func_vars)

        target_q_values = U.function(obs_ph_n + act_ph, target_q)

        return train, update_target_q, {'q_values': q_values, 'target_q_values': target_q_values}
Beispiel #12
0
    def __init__(self, name, model, obs_shape_n, act_space_n, agent_index,
                 args):
        self.name = name
        self.n = len(obs_shape_n)
        self.agent_index = agent_index
        self.args = args

        # create dummy tensor flow variables to avoid Saver error
        # TODO: remove this or turn into act function
        obs_ph_n = []
        for i in range(self.n):
            obs_ph_n.append(
                U.BatchInput(obs_shape_n[i],
                             name="observation" + str(i)).get())
        with tf.variable_scope(self.name, reuse=None):
            self.dummy_var = U.function(obs_ph_n, outputs=tf.Variable(0))
Beispiel #13
0
def make_update_exp(vals, target_vals, central=True):
    polyak = 1.0 - 1e-2
    expression = []
    agent_n_species = len(vals)
    if central:
        for var, var_target in zip(sorted(vals, key=lambda v: v.name),
                                   sorted(target_vals, key=lambda v: v.name)):
            expression.append(
                var_target.assign(polyak * var_target + (1.0 - polyak) * var))
    else:
        for a in range(agent_n_species):
            for var, var_target in zip(
                    sorted(vals[a], key=lambda v: v.name),
                    sorted(target_vals[a], key=lambda v: v.name)):
                expression.append(
                    var_target.assign(polyak * var_target +
                                      (1.0 - polyak) * var))
    expression = tf.group(*expression)
    return U.function([], [], updates=[expression])
Beispiel #14
0
def p_train(make_obs_ph_n, act_space_n, p_index, p_func, num_units=64, scope="trainer", reuse=None):
    with tf.variable_scope(scope, reuse=reuse):
        # create distribtuions
        act_pdtype_n = [make_pdtype(act_space) for act_space in act_space_n]

        # set up placeholders
        obs_ph_n = make_obs_ph_n
        act_ph_n = [act_pdtype_n[i].sample_placeholder([None], name="action"+str(i)) for i in range(len(act_space_n))]

        p_input = obs_ph_n[p_index]

        p = p_func(p_input, int(act_pdtype_n[p_index].param_shape()[0]), scope="p_func", num_units=num_units)

        # wrap parameters in distribution
        act_pd = act_pdtype_n[p_index].pdfromflat(p)

        act_sample = act_pd.sample()

        # Create callable functions
        act = U.function(inputs=[obs_ph_n[p_index]], outputs=act_sample)


        return act
Beispiel #15
0
def make_update_exp(vals, target_vals):
    """
    Update target network values using polyak averaging (exponentially decaying average).

    Args:
        vals (tf.Variable): Network variables
        target_vals (tf.Variable): Target network variables

    Returns
        Updated target network values
    """
    # Polyak coefficient for Polyak-averaging of the target network
    polyak = 1.0 - 1e-2

    expression = []
    for var, var_target in zip(sorted(vals, key=lambda v: v.name),
                               sorted(target_vals, key=lambda v: v.name)):
        # Exponentially decaying average
        expression.append(
            var_target.assign(polyak * var_target + (1.0 - polyak) * var))
    expression = tf.group(*expression)

    return tf_util.function([], [], updates=[expression])
Beispiel #16
0
def q_train(make_obs_ph_n,
            act_space_n,
            make_obs_history_n,
            make_act_history_n,
            q_index,
            q_func,
            optimizer,
            grad_norm_clipping=None,
            local_q_func=False,
            scope="trainer",
            reuse=None,
            num_units=64):
    """
    Q-Learning

        make_obs_ph_n (tf.placeholder): Placeholder for the observation space of all agents
        act_space_n (list): A list of the action spaces for all agents
        make_obs_history_n (tf.placeholder): Placeholder for the observation history of all agents
        make_act_history_n (tf.placeholder): Placeholder for the action space history of all agents
        q_index (int): Agent index number
        q_func (function): MLP Neural Network model for the agent.
        optimizer (function): Network Optimizer function
        grad_norm_clipping (float): Value by which to clip the norm of the gradient
        local_q_func (boolean): Flag for using local q function
        num_units (int): The number outputs for the layers of the model
        scope (str): The name of the scope
        reuse (boolean): Flag specifying whether to reuse the scope

    Returns:
        train (function): Training function for Q network
        update_target_q (function): Update function for updating Q network values
        q_debug (dict): Contains 'q_values' and 'target_q_values' of the Q network
    """
    with tf.variable_scope(scope, reuse=reuse):
        # Create distribtuions
        act_pdtype_n = [make_pdtype(act_space) for act_space in act_space_n]

        # Set up placeholders
        obs_ph_n = make_obs_ph_n
        obs_history_n = make_obs_history_n
        act_history_n = make_act_history_n
        act_ph_n = [
            act_pdtype_n[i].sample_placeholder([None], name="action" + str(i))
            for i in range(len(act_space_n))
        ]
        target_ph = tf.placeholder(tf.float32, [None], name="target")

        # obs_ph_n = [tf.concat(3*[x],1,name="observation{}".format(i)) for i,x in enumerate(obs_ph_n)]
        # act_ph_n = [tf.concat(3*[x],1,name="action{}".format(i)) for i,x in enumerate(act_ph_n)]

        # Original implementation
        # q_input = tf.concat(obs_ph_n + act_ph_n, 1)

        # Modified
        # Current plus 2 previous time-steps
        q_input = tf.concat(
            obs_ph_n + obs_history_n + act_ph_n + act_history_n, 1)

        if local_q_func:
            # Only have observations about myself when 'ddpg'
            # Importantly... self position is relative to prey
            q_input = tf.concat([obs_ph_n[q_index], act_ph_n[q_index]], 1)

        q = q_func(q_input, 1, scope="q_func", num_units=num_units)[:, 0]
        q_func_vars = tf_util.scope_vars(tf_util.absolute_scope_name("q_func"))

        # ************************************************************************************************

        # ccm_input = data for ccm
        # ccm_value = ccm_func(ccm_input)

        # ************************************************************************************************

        # q_loss = tf.reduce_mean(tf.square(q - target_ph)) - ccm_loss

        q_loss = tf.reduce_mean(tf.square(q - target_ph))

        # Viscosity solution to Bellman differential equation in place of an initial condition
        q_reg = tf.reduce_mean(tf.square(q))
        # loss = q_loss + 1e-3 * q_reg
        loss = q_loss

        optimize_expr = tf_util.minimize_and_clip(optimizer, loss, q_func_vars,
                                                  grad_norm_clipping)

        # Create callable functions
        train = tf_util.function(inputs=obs_ph_n + obs_history_n + act_ph_n +
                                 act_history_n + [target_ph],
                                 outputs=loss,
                                 updates=[optimize_expr])
        q_values = tf_util.function(
            obs_ph_n + obs_history_n + act_ph_n + act_history_n, q)

        # Target network
        target_q = q_func(q_input,
                          1,
                          scope="target_q_func",
                          num_units=num_units)[:, 0]
        target_q_func_vars = tf_util.scope_vars(
            tf_util.absolute_scope_name("target_q_func"))
        update_target_q = make_update_exp(q_func_vars, target_q_func_vars)

        target_q_values = tf_util.function(
            obs_ph_n + obs_history_n + act_ph_n + act_history_n, target_q)

        return train, update_target_q, {
            'q_values': q_values,
            'target_q_values': target_q_values
        }
Beispiel #17
0
def p_train(make_obs_ph_n,
            act_space_n,
            p_index,
            p_func,
            q_func,
            optimizer,
            grad_norm_clipping=None,
            local_q_func=False,
            num_units=64,
            scope="trainer",
            reuse=None):
    with tf.variable_scope(scope, reuse=reuse):
        # create distribtuions
        act_pdtype_n = [make_pdtype(act_space) for act_space in act_space_n]

        # set up placeholders
        obs_ph_n = make_obs_ph_n
        act_ph_n = [
            act_pdtype_n[i].sample_placeholder([None], name="action" + str(i))
            for i in range(len(act_space_n))
        ]

        p_input = obs_ph_n[p_index]

        p = p_func(p_input,
                   int(act_pdtype_n[p_index].param_shape()[0]),
                   scope="p_func",
                   num_units=num_units)
        p_func_vars = U.scope_vars(U.absolute_scope_name("p_func"))

        # wrap parameters in distribution
        act_pd = act_pdtype_n[p_index].pdfromflat(p)

        act_sample = act_pd.sample()
        p_reg = tf.reduce_mean(tf.square(act_pd.flatparam()))

        act_input_n = act_ph_n + []
        act_input_n[p_index] = act_pd.sample()
        q_input = tf.concat(obs_ph_n + act_input_n, 1)
        if local_q_func:
            q_input = tf.concat([obs_ph_n[p_index], act_input_n[p_index]], 1)
        q = q_func(q_input, 1, scope="q_func", reuse=True,
                   num_units=num_units)[:, 0]
        pg_loss = -tf.reduce_mean(q)

        # Gradient computation mods
        # ---------------------------------------------------------------------------------------------
        obs_flat_shape = [len(obs_ph_n) * int(obs_ph_n[0].shape[-1])]
        act_flat_shape = [len(act_space_n) * int(act_space_n[0].shape[-1])]
        obs_flat_ph = tf.placeholder(tf.float32,
                                     shape=[None] + obs_flat_shape,
                                     name="obs_flat_input")
        act_flat_ph = tf.placeholder(tf.float32,
                                     shape=[None] + act_flat_shape,
                                     name="act_flat_input")

        q_vec_input = tf.concat([obs_flat_ph, act_flat_ph], axis=-1)
        serial_q = q_func(q_vec_input,
                          1,
                          scope="q_func",
                          reuse=True,
                          num_units=num_units)[:, 0]

        # calculate gradient of serial q value wrt actions
        raw_grad = tf.gradients(serial_q, act_flat_ph)
        grad_norm = tf.divide(raw_grad, tf.norm(raw_grad))
        grad_norm_value = U.function([obs_flat_ph, act_flat_ph], grad_norm)
        # ---------------------------------------------------------------------------------------------

        loss = pg_loss + p_reg * 1e-3

        optimize_expr = U.minimize_and_clip(optimizer, loss, p_func_vars,
                                            grad_norm_clipping)

        # Create callable functions
        train = U.function(inputs=obs_ph_n + act_ph_n,
                           outputs=loss,
                           updates=[optimize_expr])
        act = U.function(inputs=[obs_ph_n[p_index]], outputs=act_sample)
        p_values = U.function([obs_ph_n[p_index]], p)

        # target network
        target_p = p_func(p_input,
                          int(act_pdtype_n[p_index].param_shape()[0]),
                          scope="target_p_func",
                          num_units=num_units)
        target_p_func_vars = U.scope_vars(
            U.absolute_scope_name("target_p_func"))
        update_target_p = make_update_exp(p_func_vars, target_p_func_vars)

        target_act_sample = act_pdtype_n[p_index].pdfromflat(target_p).sample()
        target_act = U.function(inputs=[obs_ph_n[p_index]],
                                outputs=target_act_sample)

        return act, train, update_target_p, {
            'p_values': p_values,
            'target_act': target_act,
            'grad_norm_value': grad_norm_value
        }
Beispiel #18
0
def q_train(make_obs_ph_n,
            act_space_n,
            q_index,
            q_func,
            optimizer,
            grad_norm_clipping=None,
            local_q_func=False,
            scope="trainer",
            reuse=None,
            num_units=64):
    with tf.variable_scope(scope, reuse=reuse):
        # create distribtuions
        act_pdtype_n = [make_pdtype(act_space) for act_space in act_space_n]

        # set up placeholders
        obs_ph_n = make_obs_ph_n
        act_ph_n = [
            act_pdtype_n[i].sample_placeholder([None], name="action" + str(i))
            for i in range(len(act_space_n))
        ]
        target_ph = tf.placeholder(tf.float32, [None], name="target")

        # get flattened obs and act shape
        act_shape = tf.shape(act_ph_n)
        act_serial = tf.concat(act_ph_n, 1)
        act_serial = tf.reshape(act_serial,
                                [act_shape[1], act_shape[0] * act_shape[-1]])
        act_serial_values = U.function(act_ph_n, act_serial)

        obs_shape = tf.shape(obs_ph_n)
        obs_serial = tf.concat(obs_ph_n, 1)
        obs_serial = tf.reshape(obs_serial,
                                [obs_shape[1], obs_shape[0] * obs_shape[-1]])
        obs_serial_values = U.function(obs_ph_n, obs_serial)

        obs_flat_shape = [len(obs_ph_n) * int(obs_ph_n[0].shape[-1])]
        act_flat_shape = [len(act_space_n) * int(act_space_n[0].shape[-1])]
        obs_flat_ph = tf.placeholder(tf.float32,
                                     shape=[None] + obs_flat_shape,
                                     name="obs_flat_input")
        act_flat_ph = tf.placeholder(tf.float32,
                                     shape=[None] + act_flat_shape,
                                     name="act_flat_input")

        target_input = tf.concat([obs_flat_ph, act_flat_ph], axis=-1)
        q_input = tf.concat(obs_ph_n + act_ph_n, 1)
        if local_q_func:
            q_input = tf.concat([obs_ph_n[q_index], act_ph_n[q_index]], 1)
        q = q_func(q_input, 1, scope="q_func", num_units=num_units)[:, 0]
        q_func_vars = U.scope_vars(U.absolute_scope_name("q_func"))

        q_loss = tf.reduce_mean(tf.square(q - target_ph))

        # viscosity solution to Bellman differential equation in place of an initial condition
        q_reg = tf.reduce_mean(tf.square(q))
        loss = q_loss  #+ 1e-3 * q_reg

        optimize_expr = U.minimize_and_clip(optimizer, loss, q_func_vars,
                                            grad_norm_clipping)

        # Create callable functions
        train = U.function(inputs=obs_ph_n + act_ph_n + [target_ph],
                           outputs=loss,
                           updates=[optimize_expr])
        q_values = U.function(obs_ph_n + act_ph_n, q)

        # target network
        # target_orig_q = q_func(q_input, 1, scope="target_orig_q_func", num_units=num_units)[:,0]
        target_q = q_func(target_input,
                          1,
                          scope="target_q_func",
                          num_units=num_units)[:, 0]
        target_q_func_vars = U.scope_vars(
            U.absolute_scope_name("target_q_func"))
        update_target_q = make_update_exp(q_func_vars, target_q_func_vars)

        # target_q_values = U.function(obs_ph_n + act_ph_n, target_q)
        target_q_values = U.function([obs_flat_ph, act_flat_ph], target_q)

        # calculate gradient of target q value wrt actions
        raw_grad = tf.gradients(target_q, act_flat_ph)
        grad_norm = tf.divide(raw_grad, tf.norm(raw_grad))
        grad_norm_value = U.function([obs_flat_ph, act_flat_ph], grad_norm)

        return train, update_target_q, {
            'q_values': q_values,
            'target_q_values': target_q_values,
            'act_serial_values': act_serial_values,
            'obs_serial_values': obs_serial_values,
            'grad_norm_value': grad_norm_value
        }
Beispiel #19
0
def p_train_recurrent(make_obs_ph_n,
                      make_state_ph_n,
                      make_obs_next_n,
                      make_obs_pred_n,
                      act_space_n,
                      p_index,
                      p_policy,
                      p_predict,
                      q_func,
                      optimizer,
                      grad_norm_clipping=None,
                      local_q_func=False,
                      num_units=64,
                      scope="trainer",
                      reuse=None):

    with tf.variable_scope(scope, reuse=reuse):
        # create distribtuions

        # set up placeholders
        obs_ph_n = make_obs_ph_n  # all obs, in shape Agent_num * batch_size * time_step * obs_shape
        obs_next_n = make_obs_next_n
        state_ph_n = make_state_ph_n
        obs_pred_n = make_obs_pred_n

        # used for action
        act_pdtype_n = [make_pdtype(act_space) for act_space in act_space_n]
        act_ph_n = [
            act_pdtype_n[i].sample_placeholder([None], name="action" + str(i))
            for i in range(len(act_space_n))
        ]

        # p_input is local obs of an agent
        obs_input = obs_ph_n[p_index]
        state_input = state_ph_n[p_index]
        act_input = act_ph_n[p_index]
        obs_next = obs_next_n[p_index]
        obs_pred_input = obs_pred_n[p_index]

        # get output and state
        p, gru_out, state = p_policy(
            obs_input,
            state_input,
            obs_pred_input,
            int(act_pdtype_n[p_index].param_shape()[0]),
            scope="p_policy",
            num_units=num_units)
        act_pd = act_pdtype_n[p_index].pdfromflat(
            p)  # wrap parameters in distribution
        act_sample = act_pd.sample()  # sample an action

        # predict the next obs
        obs_pred = p_predict(act_input,
                             gru_out,
                             int(obs_input.shape[1]),
                             scope="p_predict",
                             num_units=num_units)

        # variables for optimization
        p_func_vars = U.scope_vars(
            U.absolute_scope_name("p_policy")) + U.scope_vars(
                U.absolute_scope_name("p_predict"))

        pred_loss = tf.reduce_mean(tf.square(obs_next -
                                             obs_pred))  # predict loss
        p_reg = tf.reduce_mean(tf.square(act_pd.flatparam()))  # reg item
        # use critic net to get the loss about policy
        act_input_n = act_ph_n + []
        act_input_n[p_index] = act_pd.sample(
        )  # only modify the action of this agent
        q_input = tf.concat(
            obs_ph_n + act_input_n,
            1)  # get the input for Q net (all obs + all action)
        if local_q_func:
            q_input = tf.concat([obs_ph_n[p_index], act_input_n[p_index]], 1)
        q = q_func(q_input, 1, scope="q_func", reuse=True,
                   num_units=num_units)[:, 0]  # get q values
        pg_loss = -tf.reduce_mean(q)  # calculate loss to maximize Q values

        loss = pg_loss + p_reg * 1e-3 + pred_loss * 1e-3
        optimize_expr = U.minimize_and_clip(
            optimizer, loss, p_func_vars,
            grad_norm_clipping)  # update p Net parameters

        # Create callable functions
        # update P NET
        train = U.function(inputs=obs_ph_n + state_ph_n + act_ph_n +
                           obs_next_n + obs_pred_n,
                           outputs=loss,
                           updates=[optimize_expr])
        # return action and state
        step = U.function(inputs=[obs_ph_n[p_index]] + [state_ph_n[p_index]] +
                          [obs_pred_n[p_index]],
                          outputs=[act_sample] + [state] + [gru_out])
        p_values = U.function(inputs=[obs_ph_n[p_index]] +
                              [state_ph_n[p_index]] + [obs_pred_n[p_index]],
                              outputs=p)

        # target network
        target_p, target_gru_out, target_state = \
            p_policy(obs_input, state_input, obs_pred_input,
                     int(act_pdtype_n[p_index].param_shape()[0]), scope="target_p_policy", num_units=num_units)
        target_obs_pred = p_predict(act_input,
                                    target_gru_out,
                                    int(obs_input.shape[1]),
                                    scope="target_p_predict",
                                    num_units=num_units)

        target_p_func_vars = U.scope_vars(U.absolute_scope_name("target_p_policy")) + \
                             U.scope_vars(U.absolute_scope_name("target_p_predict"))
        # update the parameters θ'i = τθi + (1 − τ)θ'i
        update_target_p = make_update_exp(p_func_vars, target_p_func_vars)
        target_act_sample = act_pdtype_n[p_index].pdfromflat(target_p).sample()

        target_step = U.function(inputs=[obs_ph_n[p_index]] +
                                 [state_ph_n[p_index]] + [obs_pred_n[p_index]],
                                 outputs=[target_act_sample] + [target_state] +
                                 [target_gru_out])

        # return predicted obs
        gru_temp = tf.placeholder(tf.float32, [None] + [num_units],
                                  name='gru_out')
        pred_temp = p_predict(act_input,
                              gru_temp,
                              int(obs_input.shape[1]),
                              scope="p_predict",
                              num_units=num_units)
        predict = U.function(inputs=[act_ph_n[p_index]] + [gru_temp],
                             outputs=pred_temp)
        target_pred_temp = p_predict(act_input,
                                     gru_temp,
                                     int(obs_input.shape[1]),
                                     scope="target_p_predict",
                                     num_units=num_units)
        target_predict = U.function(inputs=[act_ph_n[p_index]] + [gru_temp],
                                    outputs=target_pred_temp)

        return step, predict, train, update_target_p, {
            'p_values': p_values,
            'target_step': target_step,
            'target_predict': target_predict
        }
Beispiel #20
0
def q_train(make_obs_ph_n,
            act_space_n,
            q_index,
            q_func,
            optimizer,
            grad_norm_clipping=None,
            scope="coma_trainer",
            reuse=None,
            num_units=64,
            num_outputs=1):
    with tf.variable_scope(scope, reuse=reuse):
        # create distribtuions
        act_pdtype_n = [make_pdtype(act_space) for act_space in act_space_n]

        # set up placeholders
        obs_ph_n = make_obs_ph_n
        act_ph_n = [
            tf.placeholder(tf.int32, [None], name="action" + str(i))
            for i in range(len(act_space_n))
        ]
        target_ph = [
            tf.placeholder(tf.float32, [None], name="target")
            for _ in range(len(act_space_n))
        ]

        # 在一维进行拼接
        q_input = tf.concat(obs_ph_n + act_ph_n, 1)
        q = q_func(q_input,
                   num_outputs,
                   scope="coma_q_func",
                   num_units=num_units)
        q_func_vars = U.scope_vars(U.absolute_scope_name("q_func"))

        q_loss = tf.reduce_mean(tf.square(q - target_ph))

        # viscosity solution to Bellman differential equation in place of an initial condition
        q_reg = tf.reduce_mean(tf.square(q))
        loss = q_loss

        optimize_expr = U.minimize_and_clip(optimizer, loss, q_func_vars,
                                            grad_norm_clipping)

        # Create callable functions
        train = U.function(inputs=obs_ph_n + act_ph_n + [target_ph],
                           outputs=loss,
                           updates=[optimize_expr])
        q_values = U.function(obs_ph_n + act_ph_n, q)

        # target network
        target_q = q_func(q_input,
                          num_outputs,
                          scope="coma_target_q_func",
                          num_units=num_units)
        target_q_func_vars = U.scope_vars(
            U.absolute_scope_name("target_q_func"))
        update_target_q = make_update_exp(q_func_vars, target_q_func_vars)

        target_q_values = U.function(obs_ph_n + act_ph_n, target_q)

        return train, update_target_q, {
            'q_values': q_values,
            'target_q_values': target_q_values
        }
Beispiel #21
0
def dqn_train(make_obs_ph_n,
              act_space_n,
              p_index,
              p_func,
              q_func,
              optimizer,
              grad_norm_clipping=None,
              local_q_func="dqn",
              num_units=64,
              scope="trainer",
              reuse=None):
    with tf.variable_scope(scope, reuse=reuse):
        # create distribtuions
        act_pdtype_n = [make_pdtype(act_space) for act_space in act_space_n]

        # set up placeholders
        obs_ph_n = make_obs_ph_n
        act_ph_n = [
            act_pdtype_n[i].sample_placeholder([None], name="action" + str(i))
            for i in range(len(act_space_n))
        ]

        p_input = obs_ph_n[p_index]

        p = p_func(p_input,
                   int(act_pdtype_n[p_index].param_shape()[0]),
                   scope="p_func",
                   num_units=num_units)
        p_func_vars = U.scope_vars(U.absolute_scope_name("p_func"))

        # wrap parameters in distribution
        act_pd = act_pdtype_n[p_index].pdfromflat(p)
        act_sample = act_pd.sample()

        target_ph = tf.placeholder(tf.float32, [None], name="target")

        tf_p = tf.reduce_sum(p, reduction_indices=1)
        loss = tf.reduce_mean(tf.square(tf_p - target_ph))
        optimize_expr = U.minimize_and_clip(optimizer, loss, p_func_vars,
                                            grad_norm_clipping)

        # Create callable functions
        train = U.function(inputs=obs_ph_n + act_ph_n + [target_ph],
                           outputs=loss,
                           updates=[optimize_expr])
        act = U.function(inputs=[obs_ph_n[p_index]], outputs=act_sample)
        p_values = U.function([obs_ph_n[p_index]], p)

        # target network
        target_p = p_func(p_input,
                          int(act_pdtype_n[p_index].param_shape()[0]),
                          scope="target_p_func",
                          num_units=num_units)
        target_p_func_vars = U.scope_vars(
            U.absolute_scope_name("target_p_func"))
        update_target_p = make_update_exp(p_func_vars, target_p_func_vars)

        target_act_sample = act_pdtype_n[p_index].pdfromflat(target_p).sample()
        target_act = U.function(inputs=[obs_ph_n[p_index]],
                                outputs=target_act_sample)

        return act, train, update_target_p, {
            'p_values': p_values,
            'target_act': target_act
        }
Beispiel #22
0
def p_train(env, make_obs_ph_n, act_space_n, p_index, vf_func, shana, q_func, shared_CNN, optimizer, make_obs_map_ph_n,grad_norm_clipping=None, local_q_func=False, num_units=64, scope="trainer", reuse=None):
    with tf.variable_scope(scope, reuse=reuse):
        # create distribtuions
        act_pdtype_n = [make_pdtype(act_space) for act_space in act_space_n]
        # set up placeholders
        obs_ph_n = make_obs_ph_n
        act_ph_n = [act_pdtype_n[i].sample_placeholder([None], name="action"+str(i)) for i in range(len(act_space_n))]
        
        obs_map_ph_n=make_obs_map_ph_n
        p_map_input=obs_map_ph_n[p_index]
        
        policy = shana(
        env_spec=env,
        af = 15,
        of = 25+12,
        K=2,
        hidden_layer_sizes=(128, 128),
        qf=q_func,
        reg=0.001
        )

        map_context_input=shared_CNN(p_map_input,p_index,scope="CNN")
        CNN_vars=U.scope_vars(U.absolute_scope_name("CNN"))

        act, log_pi = policy.actions_for(observations=tf.concat([make_obs_ph_n[p_index],map_context_input],1),
                                                   with_log_pis=True)
        act_input_n = act_ph_n + []
        
        act_input_n[p_index]=act

        p_func_vars = policy.get_params_internal()
        q_input = tf.concat(obs_ph_n + act_input_n, 1)
        vf_input = tf.concat(obs_ph_n, 1)

    for i in range(len(obs_ph_n)):
        q_input=tf.concat([q_input,shared_CNN(obs_map_ph_n[i],i,scope="agent_"+str(i)+"/CNN")],1)
        vf_input=tf.concat([vf_input,shared_CNN(obs_map_ph_n[i],i,scope="agent_"+str(i)+"/CNN")],1)
    
    with tf.variable_scope(scope, reuse=None):
        if local_q_func:
            q_input = tf.concat([obs_ph_n[p_index], act_input_n[p_index]], 1)
        q = q_func(q_input, 1, scope="q_func", reuse=True, num_units=num_units)[:,0]
        vf = q_func(vf_input, 1, scope="vf_func",reuse=True, num_units=num_units)[:,0]
        vf_func_vars = U.scope_vars(U.absolute_scope_name("vf_func"))
        pg_loss = tf.reduce_mean(log_pi * tf.stop_gradient(log_pi - q + vf))
        p_reg = tf.get_collection(
            tf.GraphKeys.REGULARIZATION_LOSSES,
            scope=policy.name)
        loss = pg_loss + p_reg
        vf_loss = 0.5 * tf.reduce_mean((vf - tf.stop_gradient(q - log_pi))**2)
        optimize_expr = U.minimize_and_clip(optimizer, loss, p_func_vars, grad_norm_clipping)
        mikoto = U.minimize_and_clip(optimizer, vf_loss, vf_func_vars, grad_norm_clipping)
    
    with tf.variable_scope(scope, reuse=True):
        CNN_p_optimizer=U.minimize_and_clip(optimizer, loss, CNN_vars, grad_norm_clipping)
        CNN_v_optimizaer=U.minimize_and_clip(optimizer, vf_loss, CNN_vars, grad_norm_clipping)
    
    with tf.variable_scope(scope, reuse=None):
        # Create callable functions
        train = U.function(inputs=obs_ph_n + act_ph_n+obs_map_ph_n, outputs=loss, updates=[optimize_expr,CNN_p_optimizer])
        misaka = U.function(inputs=obs_ph_n + act_ph_n+obs_map_ph_n, outputs=loss, updates=[mikoto,CNN_v_optimizaer])
        # target network
        target_p = shana(
        env_spec=env,
        af = 15,
        of = 25+12,
        K=2,
        hidden_layer_sizes=(128, 128),
        qf=q_func,
        reg=0.001,
        name = 'target_policy'
        )
        target_p_func_vars = target_p.get_params_internal()
        target_vf = q_func(vf_input, 1, scope="target_vf_func", num_units=num_units)[:,0]
        target_vf_func_vars = U.scope_vars(U.absolute_scope_name("target_vf_func"))
        target_act_r, tar_log = target_p.actions_for(observations=tf.concat([obs_ph_n[p_index],map_context_input],1),with_log_pis=True)
        update_target_p = make_update_exp(p_func_vars, target_p_func_vars)
        upvf = make_update_exp(vf_func_vars, target_vf_func_vars)
        target_act = U.function(inputs=[obs_ph_n[p_index], obs_map_ph_n[p_index]], outputs=target_act_r)
        act=U.function(inputs=[obs_ph_n[p_index], obs_map_ph_n[p_index]], outputs=act)
        return act, train, misaka, update_target_p, upvf, {'target_act': target_act}
Beispiel #23
0
def p_train(make_obs_ph_n,
            act_space_n,
            p_index,
            p_func,
            q_func,
            optimizer,
            num_outputs,
            grad_norm_clipping=None,
            local_q_func=False,
            num_units=64,
            scope="coma_trainer",
            reuse=None):

    with tf.variable_scope(scope, reuse=reuse):
        # create distribtuions
        act_pdtype_n = [make_pdtype(act_space) for act_space in act_space_n]

        # set up placeholders
        obs_ph_n = make_obs_ph_n
        # act_ph_n = [act_pdtype_n[i].sample_placeholder([None], name="action"+str(i)) for i in range(len(act_space_n))]
        act_ph_n = [
            tf.placeholder(tf.int32, [None], name="action" + str(i))
            for i in range(len(act_space_n))
        ]

        # actor的输入为本地的obs
        p_input = obs_ph_n[p_index]

        p = p_func(p_input,
                   int(act_pdtype_n[p_index].param_shape()[0]),
                   scope="coma_p_func",
                   num_units=num_units)
        p_func_vars = U.scope_vars(U.absolute_scope_name("p_func"))

        # wrap parameters in distribution
        act_pd = act_pdtype_n[p_index].pdfromflat(p)

        # 得到各个action的概率
        act_sample = act_pd.sample()
        # sample操作即gumble softmax  coma训练需要某个特定的动作,所以需要一个argmax操作
        act_picked = [act.tolist().index(max(act)) for act in act_sample]

        p_reg = tf.reduce_mean(tf.square(act_pd.flatparam()))

        # 为什么要加一个[]
        act_input_n = act_ph_n + []
        # 动作概率分布  替换当前agent的动作
        act_input_n[p_index] = act_picked
        q_input = tf.concat(obs_ph_n + act_input_n, 1)
        if local_q_func:
            q_input = tf.concat([obs_ph_n[p_index], act_input_n[p_index]], 1)
        q = q_func(q_input,
                   num_outputs,
                   scope="coma_q_func",
                   reuse=True,
                   num_units=num_units)

        # 反事实基线
        baseline = [
            baseline_calculation(act_distribute, q_list)
            for act_distribute, q_list in zip(act_sample, q)
        ]
        # 根据真实采取的动作获得q
        actual_picked_q = [q_list[act] for act, q_list in zip(act_picked, q)]
        # 计算当前动作的q相对于反事实基线的差值
        a = [q - b for q, b in zip(actual_picked_q, baseline)]

        pg_loss = -tf.reduce_mean(a)

        loss = pg_loss + p_reg * 1e-3

        optimize_expr = U.minimize_and_clip(optimizer, loss, p_func_vars,
                                            grad_norm_clipping)

        # Create callable functions
        train = U.function(inputs=obs_ph_n + act_ph_n,
                           outputs=loss,
                           updates=[optimize_expr])
        act = U.function(inputs=[obs_ph_n[p_index]], outputs=act_sample)
        p_values = U.function([obs_ph_n[p_index]], p)

        # target network
        target_p = p_func(p_input,
                          int(act_pdtype_n[p_index].param_shape()[0]),
                          scope="coma_target_p_func",
                          num_units=num_units)
        target_p_func_vars = U.scope_vars(
            U.absolute_scope_name("target_p_func"))
        update_target_p = make_update_exp(p_func_vars, target_p_func_vars)

        target_act_sample = act_pdtype_n[p_index].pdfromflat(target_p).sample()
        target_act = U.function(inputs=[obs_ph_n[p_index]],
                                outputs=target_act_sample)

        return act, train, update_target_p, {
            'p_values': p_values,
            'target_act': target_act
        }
Beispiel #24
0
def c_next(make_obs_ph,
           act_space,
           c_ph,
           c_next_func,
           num_constraints,
           optimizer,
           grad_norm_clipping,
           num_units=64,
           reuse=False,
           scope="c_next"):
    with tf.variable_scope(scope, reuse=reuse):
        # set up placeholders
        act_pdtype = make_pdtype(act_space[0])
        obs_ph = make_obs_ph
        act_ph = act_pdtype.sample_placeholder([None], name="action")
        c_next_target_ph = []
        for _ in range(num_constraints):
            c_next_target_ph.append(
                tf.placeholder(tf.float32, [None, 1], name="target" + str(_)))

        c_next_input = tf.concat(obs_ph, 1)
        gs_ = []
        for _ in range(num_constraints):
            gs_.append(
                c_next_func(c_next_input,
                            int((act_pdtype.param_shape()[0]) / 2),
                            scope="c_next_func" + str(_),
                            num_units=num_units))

        c_ = []  # to be testified
        for _ in range(num_constraints):
            temp = c_ph[_] + tf.multiply(gs_[_], act_ph)
            c_.append(tf.reduce_sum(temp, -1))

        c_next_vars = [
            U.scope_vars(U.absolute_scope_name("c_next_func" + str(_)))
            for _ in range(num_constraints)
        ]

        diff = [(c_[_] - c_next_target_ph[_]) for _ in range(num_constraints)]
        c_next_loss = [
            tf.reduce_mean(tf.square(diff[_])) for _ in range(num_constraints)
        ]

        optimize_expr = [
            U.minimize_and_clip(optimizer, c_next_loss[_], c_next_vars[_],
                                grad_norm_clipping)
            for _ in range(num_constraints)
        ]

        # Create callable functions
        train = [
            U.function(inputs=[obs_ph] + [act_ph] + [c_ph[_]] +
                       [c_next_target_ph[_]],
                       outputs=c_next_loss[_],
                       updates=[optimize_expr[_]])
        ]
        c_next_values = [
            U.function([obs_ph] + [act_ph] + [c_ph[_]], c_[_])
            for _ in range(num_constraints)
        ]
        g_next_values = [
            U.function([obs_ph], gs_[_]) for _ in range(num_constraints)
        ]
        return train, c_next_values, g_next_values
def q_train(make_obs_ph_n,
            act_space_n,
            q_index,
            q_func,
            u_func,
            optimizer,
            optimizer_lamda,
            exp_var_alpha=None,
            cvar_alpha=None,
            cvar_beta=None,
            grad_norm_clipping=None,
            local_q_func=False,
            scope="trainer",
            reuse=None,
            num_units=64,
            u_estimation=False,
            constrained=True,
            constraint_type=None,
            agent_type=None):
    with tf.variable_scope(scope, reuse=tf.AUTO_REUSE):
        if constrained:
            lamda_constraint = tf.get_variable(
                'lamda_constraint' + str(q_index), [1],
                initializer=tf.constant_initializer(1.0),
                dtype=tf.float32)
            if constraint_type == "CVAR":
                v_constraint = tf.get_variable(
                    'v_constraint' + str(q_index), [1],
                    initializer=tf.constant_initializer(1.0),
                    dtype=tf.float32)
        # create distribtuions
        act_pdtype_n = make_pdtype(act_space_n[q_index])
        # set up placeholders
        obs_ph_n = make_obs_ph_n
        act_ph_n = [act_pdtype_n.sample_placeholder([None], name="action0")]
        target_ph = tf.placeholder(tf.float32, [None], name="target")
        if u_estimation:
            target_ph_u = tf.placeholder(tf.float32, [None], name="target_u")
        rew = tf.placeholder(tf.float32, [None], name="reward")
        q_input = tf.concat(obs_ph_n + act_ph_n, 1)
        if local_q_func:
            q_input = tf.concat([obs_ph_n[0], act_ph_n[0]], 1)
        q = q_func(q_input, 1, scope="q_func", num_units=num_units)[:, 0]

        if u_estimation:
            u_input = tf.concat(obs_ph_n + act_ph_n, 1)
            u = u_func(u_input, 1, scope="u_func", num_units=num_units)[:, 0]
            u_loss = tf.reduce_mean(
                tf.square(
                    tf.square(rew) + 2 * tf.multiply(rew, target_ph) +
                    target_ph_u - u))
            var = u - tf.square(q)
        else:
            var = tf.square(rew + target_ph) - tf.square(q)
        if constrained:
            if constraint_type == "Exp_Var":
                #print ('In constraint generation with lamda alpha')
                constraint = lamda_constraint * (var - exp_var_alpha)
                q_loss = tf.reduce_mean(
                    tf.square(q - (target_ph + rew - constraint)))
            elif constraint_type == "CVAR":
                cvar = v_constraint + (1.0 /
                                       (1.0 - cvar_beta)) * tf.reduce_mean(
                                           tf.nn.relu(q - v_constraint))
                constraint = lamda_constraint * (cvar_alpha - cvar)
                q_loss = tf.reduce_mean(
                    tf.square(q - (target_ph + rew - constraint)))
        else:
            q_loss = tf.reduce_mean(tf.square(q - (target_ph + rew)))

        q_func_vars = U.scope_vars(U.absolute_scope_name("q_func"))
        if u_estimation:
            u_func_vars = U.scope_vars(U.absolute_scope_name("u_func"))

        # viscosity solution to Bellman differential equation in place of an initial condition
        q_reg = tf.reduce_mean(tf.square(q))
        train3 = None
        if u_estimation:
            loss = q_loss + u_loss  #+ 1e-3 * q_reg
            optimize_expr = U.minimize_and_clip(optimizer, loss,
                                                q_func_vars + u_func_vars,
                                                grad_norm_clipping)
            train = U.function(inputs=obs_ph_n + act_ph_n + [target_ph] +
                               [target_ph_u] + [rew],
                               outputs=[q_loss, u_loss],
                               updates=[optimize_expr])
            var_fn = U.function(inputs=obs_ph_n + act_ph_n + [target_ph] +
                                [target_ph_u] + [rew],
                                outputs=var)
        elif constraint_type == "CVAR":
            loss = q_loss
            optimize_expr = U.minimize_and_clip(optimizer, loss,
                                                q_func_vars + [v_constraint],
                                                grad_norm_clipping)
            train = U.function(inputs=obs_ph_n + act_ph_n + [target_ph] +
                               [rew],
                               outputs=q_loss,
                               updates=[optimize_expr])
            cvar_fn = U.function(inputs=obs_ph_n + act_ph_n + [target_ph] +
                                 [rew],
                                 outputs=cvar)
            var_fn = U.function(inputs=obs_ph_n + act_ph_n + [target_ph] +
                                [rew],
                                outputs=var)
        else:
            #print ('in loss minimization over q_func_vars')
            loss = q_loss
            optimize_expr = U.minimize_and_clip(optimizer, loss, q_func_vars,
                                                grad_norm_clipping)
            train = U.function(inputs=obs_ph_n + act_ph_n + [target_ph] +
                               [rew],
                               outputs=q_loss,
                               updates=[optimize_expr])
            var_fn = U.function(inputs=obs_ph_n + act_ph_n + [target_ph] +
                                [rew],
                                outputs=var)
            if not constrained:
                optimize_expr3 = U.minimize_and_clip(optimizer, loss,
                                                     [v_constraint],
                                                     grad_norm_clipping)
                train3 = U.function(inputs=obs_ph_n + act_ph_n + [target_ph] +
                                    [rew],
                                    outputs=q_loss,
                                    updates=[optimize_expr3])
        #loss = loss + 1e-4*q_reg
        # Create callable functions

        q_values = U.function(obs_ph_n + act_ph_n, q)

        # target network
        target_q = q_func(q_input,
                          1,
                          scope="target_q_func",
                          num_units=num_units)[:, 0]
        target_q_func_vars = U.scope_vars(
            U.absolute_scope_name("target_q_func"))
        update_target_q = make_update_exp(q_func_vars, target_q_func_vars)
        target_q_values = U.function(obs_ph_n + act_ph_n, target_q)

        if u_estimation:
            u_values = U.function(obs_ph_n + act_ph_n, u)
            target_u = u_func(u_input,
                              1,
                              scope="target_u_func",
                              num_units=num_units)[:, 0]
            target_u_func_vars = U.scope_vars(
                U.absolute_scope_name("target_u_func"))
            update_target_u = make_update_exp(u_func_vars, target_u_func_vars)
            target_u_values = U.function(obs_ph_n + act_ph_n, target_u)

        if constrained:
            loss2 = -loss
            #print ('in loss maximisation over lamda')
            optimize_expr2 = U.minimize_and_clip(optimizer_lamda, loss2,
                                                 [lamda_constraint],
                                                 grad_norm_clipping)
            if u_estimation:
                train2 = U.function(inputs=obs_ph_n + act_ph_n + [target_ph] +
                                    [target_ph_u] + [rew],
                                    outputs=loss2,
                                    updates=[optimize_expr2])
            else:
                train2 = U.function(inputs=obs_ph_n + act_ph_n + [target_ph] +
                                    [rew],
                                    outputs=loss2,
                                    updates=[optimize_expr2])

        if not u_estimation:
            update_target_u = None
            target_u_values = None
            u_values = None
        if not constrained:
            train2 = None
            lamda_constraint = None
        if constraint_type != "CVAR":
            cvar_fn = None
            v_constraint = None
        return train, train2, train3, update_target_q, update_target_u, {
            'q_values': q_values,
            'u_values': u_values,
            'target_q_values': target_q_values,
            'target_u_values': target_u_values,
            'var': var_fn,
            'cvar': cvar_fn,
            'lamda_constraint': lamda_constraint,
            'v_constraint': v_constraint,
            'optimize_expr': optimize_expr
        }
Beispiel #26
0
def p_train(make_obs_ph_n,
            act_space_n,
            p_index,
            p_func,
            q_func,
            optimizer,
            adversarial,
            adv_eps,
            adv_eps_s,
            num_adversaries,
            grad_norm_clipping=None,
            local_q_func=False,
            num_units=64,
            scope="trainer",
            reuse=None):
    with tf.variable_scope(scope, reuse=reuse):
        # create distribtuions
        act_pdtype_n = [make_pdtype(act_space) for act_space in act_space_n]

        # set up placeholders
        obs_ph_n = make_obs_ph_n
        act_ph_n = [
            act_pdtype_n[i].sample_placeholder([None], name="action" + str(i))
            for i in range(len(act_space_n))
        ]

        p_input = obs_ph_n[p_index]

        p = p_func(p_input,
                   int(act_pdtype_n[p_index].param_shape()[0]),
                   scope="p_func",
                   num_units=num_units)
        p_func_vars = U.scope_vars(U.absolute_scope_name("p_func"))

        # wrap parameters in distribution
        act_pd = act_pdtype_n[p_index].pdfromflat(p)

        act_sample = act_pd.sample()
        p_reg = tf.reduce_mean(tf.square(act_pd.flatparam()))

        act_input_n = act_ph_n + []
        act_input_n[p_index] = act_pd.sample()

        q_input = tf.concat(obs_ph_n + act_input_n, 1)
        if local_q_func:
            q_input = tf.concat([obs_ph_n[p_index], act_input_n[p_index]], 1)
        q = q_func(q_input, 1, scope="q_func", reuse=True,
                   num_units=num_units)[:, 0]
        pg_loss = -tf.reduce_mean(q)

        if adversarial:
            num_agents = len(act_input_n)
            if p_index < num_adversaries:
                adv_rate = [
                    adv_eps_s * (i < num_adversaries) + adv_eps *
                    (i >= num_adversaries) for i in range(num_agents)
                ]
            else:
                adv_rate = [
                    adv_eps_s * (i >= num_adversaries) + adv_eps *
                    (i < num_adversaries) for i in range(num_agents)
                ]
            print("      adv rate for p_index : ", p_index, adv_rate)
            raw_perturb = tf.gradients(pg_loss, act_input_n)
            perturb = [
                tf.stop_gradient(tf.nn.l2_normalize(elem, axis=1))
                for elem in raw_perturb
            ]
            perturb = [perturb[i] * adv_rate[i] for i in range(num_agents)]
            new_act_n = [
                perturb[i] + act_input_n[i] if i != p_index else act_input_n[i]
                for i in range(len(act_input_n))
            ]

            adv_q_input = tf.concat(obs_ph_n + new_act_n, 1)
            adv_q = q_func(adv_q_input,
                           1,
                           scope="q_func",
                           reuse=True,
                           num_units=num_units)[:, 0]
            pg_loss = -tf.reduce_mean(q)

        loss = pg_loss + p_reg * 1e-3

        optimize_expr = U.minimize_and_clip(optimizer, loss, p_func_vars,
                                            grad_norm_clipping)

        # Create callable functions
        train = U.function(inputs=obs_ph_n + act_ph_n,
                           outputs=loss,
                           updates=[optimize_expr])
        act = U.function(inputs=[obs_ph_n[p_index]], outputs=act_sample)
        p_values = U.function([obs_ph_n[p_index]], p)

        # target network
        target_p = p_func(p_input,
                          int(act_pdtype_n[p_index].param_shape()[0]),
                          scope="target_p_func",
                          num_units=num_units)
        target_p_func_vars = U.scope_vars(
            U.absolute_scope_name("target_p_func"))
        update_target_p = make_update_exp(p_func_vars, target_p_func_vars)

        target_act_sample = act_pdtype_n[p_index].pdfromflat(target_p).sample()
        target_act = U.function(inputs=[obs_ph_n[p_index]],
                                outputs=target_act_sample)

        return act, train, update_target_p, {
            'p_values': p_values,
            'target_act': target_act
        }
def q_train(make_obs_ph_n,
            act_space_n,
            q_index,
            q_func,
            optimizer,
            grad_norm_clipping=None,
            local_q_func=False,
            scope="trainer",
            reuse=None,
            num_units=64):
    with tf.variable_scope(scope, reuse=reuse):
        # create distribtuions
        # make_ob_ph_n是输入的placeholder,与obs_n同shape
        act_pdtype_n = [make_pdtype(act_space)
                        for act_space in act_space_n]  # 获取概率类型,传入动作维度(5)
        # act_space来自于env.act_space,由实验环境决定
        # set up placeholders
        obs_ph_n = make_obs_ph_n
        act_ph_n = [
            act_pdtype_n[i].sample_placeholder([None], name="action" + str(i))
            for i in range(len(act_space_n))
        ]
        target_ph = tf.placeholder(tf.float32, [None],
                                   name="target")  # 一维输入占位符
        # 以上为三个placeholder, [None]增加维度,不知道喂进去多少数据时使用, 即None是batchsize大小

        q_input = tf.concat(obs_ph_n + act_ph_n,
                            1)  # q函数输入网络为动作加上环境,在1维上,即q网络输入是所有agent观察和动作
        if local_q_func:  # 用ddpg时即只用自己的行为训练
            q_input = tf.concat([obs_ph_n[q_index], act_ph_n[q_index]], 1)
        q = q_func(q_input, 1, scope="q_func",
                   num_units=num_units)[:, 0]  # 取所有行的第0个数据
        q_func_vars = U.scope_vars(U.absolute_scope_name("q_func"))
        # q网络变量集合
        q_loss = tf.reduce_mean(
            tf.square(q - target_ph))  # target_ph 会被什么占据呢? 会被喂进去的td target占据
        # q网络的损失函数,均方差,target_ph来自于target网络的预测
        # viscosity solution to Bellman differential equation in place of an initial condition
        q_reg = tf.reduce_mean(tf.square(q))
        loss = q_loss  # + 1e-3 * q_reg

        optimize_expr = U.minimize_and_clip(optimizer, loss, q_func_vars,
                                            grad_norm_clipping)
        # 优化器表达式,以及是否梯度clip
        # Create callable functions
        # theano function
        train = U.function(inputs=obs_ph_n + act_ph_n + [target_ph],
                           outputs=loss,
                           updates=[optimize_expr])
        q_values = U.function(obs_ph_n + act_ph_n, q)

        # target network
        target_q = q_func(q_input,
                          1,
                          scope="target_q_func",
                          num_units=num_units)[:, 0]
        target_q_func_vars = U.scope_vars(
            U.absolute_scope_name("target_q_func"))
        update_target_q = make_update_exp(q_func_vars, target_q_func_vars)

        target_q_values = U.function(obs_ph_n + act_ph_n, target_q)
        # 以下返回值均为theano function可以直接填入传入placeholder的参数
        return train, update_target_q, {
            'q_values': q_values,
            'target_q_values': target_q_values
        }
def p_train(make_obs_ph_n,
            act_space_n,
            p_index,
            p_func,
            q_func,
            optimizer,
            grad_norm_clipping=None,
            local_q_func=False,
            num_units=64,
            scope="trainer",
            reuse=None):
    with tf.variable_scope(scope, reuse=reuse):  # 重用变量
        # create distribtuions初始动作概率分布列表
        act_pdtype_n = [make_pdtype(act_space) for act_space in act_space_n
                        ]  # 为所有agent的动作空间都创造一个动作概率分布类
        # 类的集合
        # set up placeholders
        obs_ph_n = make_obs_ph_n  # 所有的agent观察到的环境信息
        act_ph_n = [
            act_pdtype_n[i].sample_placeholder([None], name="action" + str(i))
            for i in range(len(act_space_n))
        ]
        # 返回用于存放每个agent的动作的占位符集合,用于填充所有agent选择的动作[none]代表可以填入无数组数据
        p_input = obs_ph_n[p_index]  # 仅观察到自身周围环境

        p = p_func(p_input,
                   int(act_pdtype_n[p_index].param_shape()[0]),
                   scope="p_func",
                   num_units=num_units)
        # 建立神经网络,输出单元数为动作个数...这代码写的太呆了 输出每一个动作的值
        p_func_vars = U.scope_vars(U.absolute_scope_name("p_func"))
        # 获取该神经网络全部变量
        # wrap parameters in distribution
        act_pd = act_pdtype_n[p_index].pdfromflat(p)
        act_sample = act_pd.sample()  # 确定性动作叠加噪声进行探索,成为随机策略,得到一组act,作用未知
        p_reg = tf.reduce_mean(tf.square(
            act_pd.flatparam()))  # flatparam是所有动作的actor网络输出值的集合
        # 猜测引入p_reg是因为预测其agent动作的需要
        act_input_n = act_ph_n + []
        act_input_n[p_index] = act_pd.sample(
        )  # 仅替换自己的动作输入,自己的动作来自于自己的policy网络输出
        # 所以通过这一步将两个网络连接,通过q网络优化自己的policy网络
        q_input = tf.concat(obs_ph_n + act_input_n,
                            1)  # q输入所有的环境观察值与所有的agents采取的动作
        # q的输入
        if local_q_func:
            q_input = tf.concat([obs_ph_n[p_index], act_input_n[p_index]], 1)
        q = q_func(q_input, 1, scope="q_func", reuse=True,
                   num_units=num_units)[:, 0]
        # 这里是用的q_func由于reuse所以使用已经创建好的变量,即自己的q网络而不是再创建一个
        # q_train,p_train属于同一个scope!
        # 策略优化目标
        pg_loss = -tf.reduce_mean(q)  # loss与p_reg均需要加-号进行优化
        # 目标使q的均值最大,等于采样后的-reduce_mean最小
        loss = pg_loss + p_reg * 1e-3  # 引入熵?
        # 梯度下降优化器节点表达式
        optimize_expr = U.minimize_and_clip(optimizer, loss, p_func_vars,
                                            grad_norm_clipping)

        # Create callable functions可调用函数,批量使用session训练
        train = U.function(inputs=obs_ph_n + act_ph_n,
                           outputs=loss,
                           updates=[optimize_expr])
        act = U.function(inputs=[obs_ph_n[p_index]],
                         outputs=act_sample)  # 依据自身观察给出确定性动作
        p_values = U.function([obs_ph_n[p_index]], p)  # 输出的是动作值集合

        # target network
        target_p = p_func(p_input,
                          int(act_pdtype_n[p_index].param_shape()[0]),
                          scope="target_p_func",
                          num_units=num_units)
        target_p_func_vars = U.scope_vars(
            U.absolute_scope_name("target_p_func"))
        update_target_p = make_update_exp(p_func_vars, target_p_func_vars)

        target_act_sample = act_pdtype_n[p_index].pdfromflat(target_p).sample()
        target_act = U.function(inputs=[obs_ph_n[p_index]],
                                outputs=target_act_sample)

        return act, train, update_target_p, {
            'p_values': p_values,
            'target_act': target_act
        }
Beispiel #29
0
def p_train(make_obs_ph_n,
            act_space_n,
            make_obs_history_n,
            make_act_history_n,
            p_index,
            p_func,
            q_func,
            optimizer,
            grad_norm_clipping=None,
            local_q_func=False,
            num_units=64,
            scope="trainer",
            reuse=None):
    """
    Policy learning guided by Q-value

    Args:
        make_obs_ph_n (tf.placeholder): Placeholder for the observation space of all agents
        act_space_n (list): A list of the action spaces for all agents
        make_obs_history_n (tf.placeholder): Placeholder for the observation history of all agents
        make_act_history_n (tf.placeholder): Placeholder for the action space history of all agents
        p_index (int): Agent index number
        p_func (function): MLP Neural Network model for the agent.
        q_func (function): MLP Neural Network model for the agent.
        optimizer (function): Network Optimizer function
        grad_norm_clipping (float): Value by which to clip the norm of the gradient
        local_q_func (boolean): Flag for using local q function
        num_units (int): The number outputs for the layers of the model
        scope (str): The name of the scope
        reuse (boolean): Flag specifying whether to reuse the scope

    Returns:
        act (function): Action function for retrieving agent action.
        train (function): Training function for P network
        update_target_p (function): Update function for updating P network values
        p_debug (dict): Contains 'p_values' and 'target_act' of the P network
    """
    with tf.variable_scope(scope, reuse=reuse):
        # Create distribtuions
        act_pdtype_n = [make_pdtype(act_space) for act_space in act_space_n]

        # Set up placeholders
        obs_ph_n = make_obs_ph_n
        obs_history_n = make_obs_history_n
        act_history_n = make_act_history_n

        act_ph_n = [
            act_pdtype_n[i].sample_placeholder([None], name="action" + str(i))
            for i in range(len(act_space_n))
        ]
        ccm_ph_n = [
            tf.placeholder(tf.float32, [None], name="ccm" + str(p_index))
        ]
        ccm_lambda = [
            tf.placeholder(tf.float32, [None], name="lambda" + str(p_index))
        ]
        ccm_switch = [
            tf.placeholder(tf.float32, [None], name="switch" + str(p_index))
        ]

        # Original implementation
        # p_input = obs_ph_n[p_index]

        # Modified
        p_input = tf.concat([obs_ph_n[p_index], obs_history_n[p_index]], 1)

        p = p_func(p_input,
                   int(act_pdtype_n[p_index].param_shape()[0]),
                   scope="p_func",
                   num_units=num_units)
        p_func_vars = tf_util.scope_vars(tf_util.absolute_scope_name("p_func"))

        # Wrap parameters in distribution
        act_pd = act_pdtype_n[p_index].pdfromflat(p)

        act_sample = act_pd.sample()
        p_reg = tf.reduce_mean(tf.square(act_pd.flatparam()))

        # Original implementation
        act_input_n = act_ph_n + []
        act_input_n[p_index] = act_pd.sample()

        # Modified
        # act_input_n = act_hist_ph_n + []
        # act_input_n[p_index] = act_pd.mode()

        # Original implementation
        # q_input = tf.concat(obs_ph_n + act_input_n, 1)

        # Modified
        # Current plus previous time-steps
        q_input = tf.concat(
            obs_ph_n + obs_history_n + act_ph_n + act_history_n, 1)

        if local_q_func:
            # Only have observations about myself when 'ddpg'
            # Importantly... [my.x, my.y, my.dx, my.dy, r1.x]
            q_input = tf.concat([obs_ph_n[p_index], act_input_n[p_index]], 1)

        q = q_func(q_input, 1, scope="q_func", reuse=True,
                   num_units=num_units)[:, 0]

        # This is probably because of DDPG, rather than DSPG
        # pg_loss = -tf.reduce_mean(q)
        # ************************************************************************************************
        # ccm_input = something
        # ccm_value = ccm_func(ccm_input)

        pg_loss = -(1 - ccm_lambda[0]) * tf.reduce_mean(q) * (1 - ccm_switch[0]) - \
                  ccm_lambda[0] * ccm_ph_n[0] * ccm_switch[0]

        # ************************************************************************************************

        loss = pg_loss + p_reg * 1e-3

        optimize_expr = tf_util.minimize_and_clip(optimizer, loss, p_func_vars,
                                                  grad_norm_clipping)

        # Create callable functions
        train = tf_util.function(inputs=obs_ph_n + obs_history_n + act_ph_n +
                                 act_history_n + ccm_ph_n + ccm_lambda +
                                 ccm_switch,
                                 outputs=loss,
                                 updates=[optimize_expr])

        # Original implementation
        # act = tf_util.function(inputs=[obs_ph_n[p_index]], outputs=act_sample)
        # p_values = tf_util.function([obs_ph_n[p_index]], p)

        # Modified
        act = tf_util.function(inputs=[obs_ph_n[p_index]] +
                               [obs_history_n[p_index]],
                               outputs=act_sample)
        p_values = tf_util.function(inputs=[obs_ph_n[p_index]] +
                                    [obs_history_n[p_index]],
                                    outputs=p)

        # target network
        target_p = p_func(p_input,
                          int(act_pdtype_n[p_index].param_shape()[0]),
                          scope="target_p_func",
                          num_units=num_units)
        target_p_func_vars = tf_util.scope_vars(
            tf_util.absolute_scope_name("target_p_func"))
        update_target_p = make_update_exp(p_func_vars, target_p_func_vars)

        target_act_sample = act_pdtype_n[p_index].pdfromflat(target_p).sample()

        # Original implementation
        # target_act = tf_util.function(inputs=[obs_ph_n[p_index]], outputs=target_act_sample)

        # Modified
        target_act = tf_util.function(inputs=[obs_ph_n[p_index]] +
                                      [obs_history_n[p_index]],
                                      outputs=target_act_sample)

        return act, train, update_target_p, {
            'p_values': p_values,
            'target_act': target_act
        }
Beispiel #30
0
def q_train(make_obs_ph_n,
            act_space_n,
            q_index,
            q_func,
            optimizer,
            adversarial,
            adv_eps,
            adv_eps_s,
            num_adversaries,
            grad_norm_clipping=None,
            local_q_func=False,
            scope="trainer",
            reuse=None,
            num_units=64):
    with tf.variable_scope(scope, reuse=reuse):
        # create distribtuions
        act_pdtype_n = [make_pdtype(act_space) for act_space in act_space_n]

        # set up placeholders
        obs_ph_n = make_obs_ph_n
        act_ph_n = [
            act_pdtype_n[i].sample_placeholder([None], name="action" + str(i))
            for i in range(len(act_space_n))
        ]
        target_ph = tf.placeholder(tf.float32, [None], name="target")

        q_input = tf.concat(obs_ph_n + act_ph_n, 1)
        if local_q_func:
            q_input = tf.concat([obs_ph_n[q_index], act_ph_n[q_index]], 1)
        q = q_func(q_input, 1, scope="q_func", num_units=num_units)[:, 0]
        q_func_vars = U.scope_vars(U.absolute_scope_name("q_func"))

        q_loss = tf.reduce_mean(tf.square(q - target_ph))

        # viscosity solution to Bellman differential equation in place of an initial condition
        q_reg = tf.reduce_mean(tf.square(q))
        loss = q_loss  #+ 1e-3 * q_reg

        optimize_expr = U.minimize_and_clip(optimizer, loss, q_func_vars,
                                            grad_norm_clipping)

        # Create callable functions
        train = U.function(inputs=obs_ph_n + act_ph_n + [target_ph],
                           outputs=loss,
                           updates=[optimize_expr])
        q_values = U.function(obs_ph_n + act_ph_n, q)

        # target network
        target_q = q_func(q_input,
                          1,
                          scope="target_q_func",
                          num_units=num_units)[:, 0]

        if adversarial:
            num_agents = len(act_ph_n)
            if q_index < num_adversaries:
                adv_rate = [
                    adv_eps_s * (i < num_adversaries) + adv_eps *
                    (i >= num_adversaries) for i in range(num_agents)
                ]
            else:
                adv_rate = [
                    adv_eps_s * (i >= num_adversaries) + adv_eps *
                    (i < num_adversaries) for i in range(num_agents)
                ]
            print("      adv rate for q_index : ", q_index, adv_rate)

            pg_loss = -tf.reduce_mean(target_q)
            raw_perturb = tf.gradients(pg_loss, act_ph_n)
            perturb = [
                adv_eps * tf.stop_gradient(tf.nn.l2_normalize(elem, axis=1))
                for elem in raw_perturb
            ]
            new_act_n = [
                perturb[i] + act_ph_n[i] if i != q_index else act_ph_n[i]
                for i in range(len(act_ph_n))
            ]
            adv_q_input = tf.concat(obs_ph_n + new_act_n, 1)
            target_q = q_func(adv_q_input,
                              1,
                              scope='target_q_func',
                              reuse=True,
                              num_units=num_units)[:, 0]

        target_q_func_vars = U.scope_vars(
            U.absolute_scope_name("target_q_func"))
        update_target_q = make_update_exp(q_func_vars, target_q_func_vars)

        target_q_values = U.function(obs_ph_n + act_ph_n, target_q)

        return train, update_target_q, {
            'q_values': q_values,
            'target_q_values': target_q_values
        }