Ejemplo n.º 1
0
def q_train(make_obs_ph_n,
            act_space_n,
            q_index,
            q_func,
            optimizer,
            grad_norm_clipping=None,
            local_q_func=False,
            scope="trainer",
            reuse=None,
            num_units=64):
    """Build the Q-function (critic) training graph inside ``scope``.

    Args:
        make_obs_ph_n: list of observation placeholders, one per agent. It is
            used as-is (despite the name, it is not called).
        act_space_n: per-agent action spaces; each is passed to
            ``make_pdtype`` to obtain the matching action placeholder.
        q_index: index of the agent whose own observation/action are fed to
            the critic when ``local_q_func`` is true.
        q_func: callable invoked as
            ``q_func(input, 1, scope=..., num_units=...)``; column 0 of its
            result is taken as the Q-value.
        optimizer: forwarded to ``U.minimize_and_clip``.
        grad_norm_clipping: forwarded to ``U.minimize_and_clip``.
        local_q_func: if true the critic only sees agent ``q_index``'s
            observation and action instead of every agent's.
        scope: name of the variable scope everything is created in.
        reuse: ``reuse`` flag handed to ``tf.variable_scope``.
        num_units: forwarded to ``q_func``.

    Returns:
        A tuple ``(train, update_target_q, debug)``. ``train`` is a
        ``U.function`` over observations, actions and the target that outputs
        the loss and runs the optimizer update. ``update_target_q`` is
        whatever ``make_update_exp`` returns for the online/target variables.
        ``debug`` maps ``'q_values'`` and ``'target_q_values'`` to
        ``U.function`` callables over observations and actions.
    """
    with tf.variable_scope(scope, reuse=reuse):
        # One action-distribution type per agent.
        pdtypes = [make_pdtype(space) for space in act_space_n]

        # Observations come from the caller; actions and the regression
        # target get fresh placeholders here.
        obs_ph_n = make_obs_ph_n
        act_ph_n = [
            pdtype.sample_placeholder([None], name="action" + str(idx))
            for idx, pdtype in enumerate(pdtypes)
        ]
        target_ph = tf.placeholder(tf.float32, [None], name="target")
        obs_and_act = obs_ph_n + act_ph_n

        # Centralised input by default; overridden with the agent's own
        # observation/action for a local critic.
        critic_in = tf.concat(obs_and_act, 1)
        if local_q_func:
            critic_in = tf.concat([obs_ph_n[q_index], act_ph_n[q_index]], 1)

        q_out = q_func(critic_in, 1, scope="q_func", num_units=num_units)
        q = q_out[:, 0]
        q_func_vars = U.scope_vars(U.absolute_scope_name("q_func"))

        # Mean squared error against the supplied target.
        td_error = q - target_ph
        q_loss = tf.reduce_mean(tf.square(td_error))

        # Magnitude regulariser on Q. It is built but deliberately left out
        # of the loss (the weighted term was disabled in the original code).
        q_reg = tf.reduce_mean(tf.square(q))
        loss = q_loss  # + 1e-3 * q_reg

        optimize_expr = U.minimize_and_clip(
            optimizer, loss, q_func_vars, grad_norm_clipping)

        # Callables exposed to the caller.
        train = U.function(inputs=obs_and_act + [target_ph],
                           outputs=loss,
                           updates=[optimize_expr])
        q_values = U.function(obs_and_act, q)

        # Target network: same input, separate "target_q_func" variables.
        target_out = q_func(critic_in,
                            1,
                            scope="target_q_func",
                            num_units=num_units)
        target_q = target_out[:, 0]
        target_q_func_vars = U.scope_vars(
            U.absolute_scope_name("target_q_func"))
        update_target_q = make_update_exp(q_func_vars, target_q_func_vars)

        target_q_values = U.function(obs_and_act, target_q)

        debug = {
            'q_values': q_values,
            'target_q_values': target_q_values
        }
        return train, update_target_q, debug
Ejemplo n.º 2
0
def p_train(make_obs_ph_n,
            act_space_n,
            p_index,
            p_func,
            q_func,
            optimizer,
            grad_norm_clipping=None,
            local_q_func=False,
            num_units=64,
            scope="trainer",
            reuse=None):
    """Build the policy (actor) training graph for agent ``p_index``.

    Args:
        make_obs_ph_n: list of observation placeholders, one per agent. It is
            used as-is (despite the name, it is not called).
        act_space_n: per-agent action spaces; each is passed to
            ``make_pdtype``.
        p_index: index of the agent whose policy is built.
        p_func: callable invoked as
            ``p_func(obs, num_outputs, scope=..., num_units=...)`` producing
            the flat distribution parameters.
        q_func: callable invoked with ``reuse=True`` under scope ``"q_func"``;
            column 0 of its result is the Q-value used in the policy loss.
        optimizer: forwarded to ``U.minimize_and_clip``.
        grad_norm_clipping: forwarded to ``U.minimize_and_clip``.
        local_q_func: if true the critic input is only this agent's
            observation and sampled action.
        num_units: forwarded to ``p_func`` and ``q_func``.
        scope: name of the variable scope everything is created in.
        reuse: ``reuse`` flag handed to ``tf.variable_scope``.

    Returns:
        A tuple ``(act, train, update_target_p, debug)``. ``act`` maps this
        agent's observation to a sampled action. ``train`` is a
        ``U.function`` over all observations and action placeholders that
        outputs the loss and runs the optimizer update. ``update_target_p``
        is whatever ``make_update_exp`` returns for the online/target policy
        variables. ``debug`` maps ``'p_values'`` and ``'target_act'`` to
        ``U.function`` callables over this agent's observation.
    """
    with tf.variable_scope(scope, reuse=reuse):
        # create distributions
        act_pdtype_n = [make_pdtype(act_space) for act_space in act_space_n]

        # set up placeholders
        obs_ph_n = make_obs_ph_n
        act_ph_n = [
            act_pdtype_n[i].sample_placeholder([None], name="action" + str(i))
            for i in range(len(act_space_n))
        ]

        p_input = obs_ph_n[p_index]
        # Number of flat distribution parameters the policy must output;
        # shared by the online and target networks.
        n_params = int(act_pdtype_n[p_index].param_shape()[0])

        p = p_func(p_input, n_params, scope="p_func", num_units=num_units)
        p_func_vars = U.scope_vars(U.absolute_scope_name("p_func"))

        # wrap parameters in distribution
        act_pd = act_pdtype_n[p_index].pdfromflat(p)

        act_sample = act_pd.sample()
        p_reg = tf.reduce_mean(tf.square(act_pd.flatparam()))

        # Copy the placeholder list, then substitute this agent's action with
        # a sample from its own policy so the loss depends on the policy.
        act_input_n = list(act_ph_n)
        act_input_n[p_index] = act_pd.sample()
        q_input = tf.concat(obs_ph_n + act_input_n, 1)
        if local_q_func:
            q_input = tf.concat([obs_ph_n[p_index], act_input_n[p_index]], 1)

        # reuse=True: the "q_func" variables must already exist in this scope
        # (presumably created by q_train -- confirm with the caller).
        q = q_func(q_input, 1, scope="q_func", reuse=True,
                   num_units=num_units)[:, 0]
        pg_loss = -tf.reduce_mean(q)

        loss = pg_loss + p_reg * 1e-3

        # Only the policy variables are optimized; the critic is left alone.
        optimize_expr = U.minimize_and_clip(optimizer, loss, p_func_vars,
                                            grad_norm_clipping)

        # Create callable functions
        train = U.function(inputs=obs_ph_n + act_ph_n,
                           outputs=loss,
                           updates=[optimize_expr])
        act = U.function(inputs=[obs_ph_n[p_index]], outputs=act_sample)
        p_values = U.function([obs_ph_n[p_index]], p)

        # target network
        target_p = p_func(p_input,
                          n_params,
                          scope="target_p_func",
                          num_units=num_units)
        target_p_func_vars = U.scope_vars(
            U.absolute_scope_name("target_p_func"))
        update_target_p = make_update_exp(p_func_vars, target_p_func_vars)

        target_act_sample = act_pdtype_n[p_index].pdfromflat(target_p).sample()
        target_act = U.function(inputs=[obs_ph_n[p_index]],
                                outputs=target_act_sample)

        return act, train, update_target_p, {
            'p_values': p_values,
            'target_act': target_act
        }
Ejemplo n.º 3
0
def group_p_train(make_obs_ph_n,
                  act_space_n,
                  p_index,
                  num_adversaries,
                  p_func,
                  q_func,
                  optimizer,
                  grad_norm_clipping=None,
                  local_q_func=False,
                  num_units=64,
                  scope="trainer",
                  reuse=None):
    """Build a policy training graph that trains over agent ``p_index``'s group.

    Agents ``0 .. num_adversaries - 1`` form the adversary group and the
    remaining agents form the other group. The policy network is evaluated on
    the observations of every agent in ``p_index``'s group (stacked along the
    batch axis), and the loss averages the critic value over the whole group.

    Args:
        make_obs_ph_n: list of observation placeholders, one per agent. It is
            used as-is (despite the name, it is not called). The reshape
            below assumes all agents of a group share one observation width.
        act_space_n: per-agent action spaces; each is passed to
            ``make_pdtype``.
        p_index: index of the agent this trainer belongs to.
        num_adversaries: number of leading agents in the adversary group.
        p_func: callable invoked as
            ``p_func(obs, num_outputs, scope=..., num_units=...)``.
        q_func: callable invoked with ``reuse=True`` under scope ``"q_func"``;
            column 0 of its result is the Q-value used in the policy loss.
        optimizer: forwarded to ``U.minimize_and_clip``.
        grad_norm_clipping: forwarded to ``U.minimize_and_clip``.
        local_q_func: accepted for signature parity with ``p_train`` but
            currently ignored; the critic always sees all agents.
        num_units: forwarded to ``p_func`` and ``q_func``.
        scope: name of the variable scope everything is created in.
        reuse: ``reuse`` flag handed to ``tf.variable_scope``.

    Returns:
        A tuple ``(act, train, update_target_p, debug)``. ``act`` maps this
        agent's observation to a sampled action. ``train`` is a
        ``U.function`` over all observations plus one full set of action
        placeholders per group member; it outputs the loss and runs the
        optimizer update. ``update_target_p`` is whatever ``make_update_exp``
        returns for the online/target policy variables. ``debug`` maps
        ``'p_values'`` and ``'target_act'`` to ``U.function`` callables over
        this agent's observation.
    """
    with tf.variable_scope(scope, reuse=reuse):
        # create distributions
        act_pdtype_n = [make_pdtype(act_space) for act_space in act_space_n]

        obs_ph_n = make_obs_ph_n
        n_agents = len(obs_ph_n)

        # The group is the contiguous agent range
        # [group_start, group_start + group_size).
        if p_index < num_adversaries:
            group_start, group_size = 0, num_adversaries
        else:
            group_start = num_adversaries
            group_size = n_agents - num_adversaries
        n_params = int(act_pdtype_n[p_index].param_shape()[0])

        # One complete set of per-agent action placeholders for every group
        # member (row n belongs to the n-th member of the group).
        act_ph_ns = [[
            act_pdtype_n[i].sample_placeholder([None],
                                               name="action" + str(n) +
                                               '_' + str(i))
            for i in range(len(act_space_n))
        ] for n in range(group_size)]
        act_ph_ns_flatten = list(chain.from_iterable(act_ph_ns))

        # Batchify the group's observations: concatenate along features, then
        # fold the agent axis into the batch axis.
        p_input = tf.concat(obs_ph_n[group_start:group_start + group_size], 1)
        p_input = tf.reshape(
            p_input, [-1, p_input.shape[-1].value // group_size])

        # Policy parameters for every (sample, group member) row.
        p = p_func(p_input, n_params, scope="p_func", num_units=num_units)
        p_func_vars = U.scope_vars(U.absolute_scope_name("p_func"))

        # Un-batchify: one block of distribution parameters per group member.
        p = tf.reshape(p, [-1, p.shape[-1] * group_size])
        ps = tf.split(p, num_or_size_splits=group_size, axis=1)

        # Probability distribution of each group member's action.
        act_pds = [
            act_pdtype_n[group_start + k].pdfromflat(ps[k])
            for k in range(group_size)
        ]

        # Average parameter regulariser over the group.
        p_reg = tf.reduce_mean(
            tf.square(tf.concat([act_pd.flatparam() for act_pd in act_pds],
                                -1)))

        # For each group member build a critic input in which that member's
        # action placeholder is replaced by a sample from its policy. The
        # placeholder rows are copied so act_ph_ns itself is never mutated.
        q_inputs = []
        for k, act_pd in enumerate(act_pds):
            act_input_n = list(act_ph_ns[k])
            act_input_n[group_start + k] = act_pd.sample()
            q_inputs.append(tf.concat(obs_ph_n + act_input_n, 1))
        # batchify q_input
        q_input = tf.concat(q_inputs, 0)

        # reuse=True: the "q_func" variables must already exist in this scope
        # (presumably created by q_train -- confirm with the caller).
        q = q_func(q_input, 1, scope="q_func", reuse=True,
                   num_units=num_units)[:, 0]
        pg_loss = -tf.reduce_mean(q)

        loss = pg_loss + p_reg * 1e-3

        optimize_expr = U.minimize_and_clip(optimizer, loss, p_func_vars,
                                            grad_norm_clipping)

        # Create callable functions
        train = U.function(inputs=obs_ph_n + act_ph_ns_flatten,
                           outputs=loss,
                           updates=[optimize_expr])

        # The per-member slices ps[k] are computed from the concatenation of
        # every observation placeholder in the group, so they cannot be
        # evaluated by a callable that is only fed obs_ph_n[p_index]. Build a
        # single-agent head on the same "p_func" variables instead.
        # Assumes p_func treats batch rows independently (e.g. a plain MLP),
        # so this head matches the agent's slice of the batched output --
        # TODO confirm for the p_func in use.
        with tf.variable_scope(tf.get_variable_scope(), reuse=True):
            own_p = p_func(obs_ph_n[p_index],
                           n_params,
                           scope="p_func",
                           num_units=num_units)
        own_act_sample = act_pdtype_n[p_index].pdfromflat(own_p).sample()
        act = U.function(inputs=[obs_ph_n[p_index]], outputs=own_act_sample)
        p_values = U.function([obs_ph_n[p_index]], own_p)

        # Target network. Feed this agent's observation placeholder directly:
        # it holds the same values as the agent's column block of the
        # batched group input, without depending on the other placeholders.
        target_p = p_func(obs_ph_n[p_index],
                          n_params,
                          scope="target_p_func",
                          num_units=num_units)
        target_p_func_vars = U.scope_vars(
            U.absolute_scope_name("target_p_func"))
        update_target_p = make_update_exp(p_func_vars, target_p_func_vars)

        target_act_sample = act_pdtype_n[p_index].pdfromflat(target_p).sample()
        target_act = U.function(inputs=[obs_ph_n[p_index]],
                                outputs=target_act_sample)

        return act, train, update_target_p, {
            'p_values': p_values,
            'target_act': target_act
        }