Example #1
def test_lio(config):

    seed = config.main.seed
    np.random.seed(seed)
    random.seed(seed)
    tf.set_random_seed(seed)

    dir_name = config.main.dir_name
    exp_name = config.main.exp_name
    log_path = os.path.join('..', 'results', exp_name, dir_name)
    model_name = config.main.model_name

    n_test = config.alg.n_test

    env = room_symmetric.Env(config.env)

    if config.lio.use_actor_critic:
        from lio_ac import LIO
    else:
        from lio_agent import LIO

    list_agents = []
    for agent_id in range(env.n_agents):
        list_agents.append(
            LIO(config.lio, env.l_obs, env.l_action, config.nn,
                'agent_%d' % agent_id, config.env.r_multiplier, env.n_agents,
                agent_id))

    for agent_id in range(env.n_agents):
        list_agents[agent_id].receive_list_of_agents(list_agents)
        list_agents[agent_id].create_policy_gradient_op()
        list_agents[agent_id].create_update_op()

    for agent_id in range(env.n_agents):
        list_agents[agent_id].create_reward_train_op()

    if config.lio.asymmetric:
        assert config.env.n_agents == 2
        for agent_id in range(env.n_agents):
            list_agents[agent_id].set_can_give(
                agent_id != config.lio.idx_recipient)

    config_proto = tf.ConfigProto()
    if config.main.use_gpu:
        config_proto.device_count['GPU'] = 1
        config_proto.gpu_options.allow_growth = True
    else:
        config_proto.device_count['GPU'] = 0
    sess = tf.Session(config=config_proto)

    saver = tf.train.Saver()
    print("Restoring variables from %s" % dir_name)
    saver.restore(sess, os.path.join(log_path, model_name))

    _ = evaluate.test_room_symmetric(n_test,
                                     env,
                                     sess,
                                     list_agents,
                                     log=True,
                                     log_path=log_path)
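
All of these examples expect a nested config object with attribute access (config.main.seed, config.lio.use_actor_critic, and so on). A minimal, hypothetical sketch of such a container, with placeholder values rather than the project's real defaults, and intended only to illustrate the attribute-access pattern (the actual project likely uses a serialisable config dict):

from types import SimpleNamespace

def make_config():
    # Placeholder values; field names mirror the accesses in the examples.
    main = SimpleNamespace(seed=12340, dir_name='er_run', exp_name='lio',
                           model_name='model.ckpt', use_gpu=False)
    alg = SimpleNamespace(n_test=100)
    env = SimpleNamespace(n_agents=2, r_multiplier=2.0)
    lio = SimpleNamespace(use_actor_critic=False, asymmetric=False,
                          idx_recipient=1)
    nn = SimpleNamespace(n_h1=64, n_h2=32)
    return SimpleNamespace(main=main, alg=alg, env=env, lio=lio, nn=nn)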
Example #2
def train(config):

    # set random seed
    seed = 1234
    np.random.seed(seed)
    random.seed(seed)
    torch.manual_seed(seed)

    results_one = []
    results_two = []
    reward_one_to_two = []
    reward_two_to_one = []

    # init game env
    env = room_symmetric.Env(config.env)

    agents = []
    for i in range(env.n_agents):
        agents.append(Actor(i, 7, env.n_agents))

    # epoch start
    for epoch in range(5000):
        if (epoch + 1) % 500 == 0:
            print("Epoch: ", epoch + 1, "/5000")
        """
        The first trajectory generation
        """
        trajs = [Trajectory() for _ in range(env.n_agents)]
        list_obs = env.reset()
        list_obs_next = None
        done = False

        while not done:
            if list_obs_next is not None:
                list_obs = list_obs_next

            # decide actions from observations
            list_act, list_act_hot = action_sampling(agents, list_obs)

            # give incentivisation
            inctv_to, inctv_from = give_incentivisation(agents, list_act_hot, config)
            # execute step
            list_obs_next, env_rewards, done = env.step(list_act, inctv_to)

            # save trajectory
            for agent in agents:
                trajs[agent.id].add(agent.get_obs(), agent.get_action(), agent.get_action_hot(), env_rewards[agent.id],
                                    inctv_from[agent.id])

        for agent in agents:
            agent.update_policy(trajs)

        """
        The second trajectory generation
        """
        # Generate a new trajectory
        trajs_new = [Trajectory() for _ in range(env.n_agents)]
        list_obs = env.reset()
        list_obs_next = None
        done = False
        result_one_new = 0
        result_two_new = 0

        while not done:
            if list_obs_next is not None:
                list_obs = list_obs_next
            # decide actions from observations
            list_act, list_act_hot = action_sampling(agents, list_obs)

            # give incentivisation
            inctv_to, new_inctv_from_others = give_incentivisation(agents, list_act_hot, config)

            reward_one_to_two.append(inctv_to[0])
            reward_two_to_one.append(inctv_to[1])

            # execute step
            list_obs_next, env_rewards, done = env.step(list_act, inctv_to)

            for agent in agents:
                trajs_new[agent.id].add(agent.get_obs(), agent.get_action(), agent.get_action_hot(), env_rewards[agent.id],
                                        new_inctv_from_others[agent.id])

            result_one_new += env_rewards[0]
            result_two_new += env_rewards[1]

            if done:
                results_one.append(result_one_new)
                results_two.append(result_two_new)

        # compute new log prob act
        log_prob_act_other = [[] for _ in range(config.env.n_agents)]
        for agent in agents:
            states_new = [trajectory.get_state() for trajectory in trajs_new]
            actions_new = [trajectory.get_action() for trajectory in trajs_new]
            logits, _ = agent.policy_net(states_new[agent.id], agent.new_params)
            # grad_graph(logits, 'logits')
            log_prob = F.log_softmax(logits, dim=-1)
            log_prob_act = torch.stack([log_prob[i][actions_new[agent.id][i]]
                                        for i in range(len(actions_new[agent.id]))],
                                       dim=0)
            log_prob_act_other[agent.id] = log_prob_act

        # optimizer.zero_grad()
        # loss_p = [torch.Tensor() for _ in range(2)]
        # for agent in agents:
        #     loss_p[agent.id] = agent.update_rewards_giving(trajs, trajs_new, log_prob_act_other)
        # loss = loss_p[0] + loss_p[1]
        # loss.backward()
        # optimizer.step()
        for agent in agents:
            agent.update_rewards_giving(trajs, trajs_new, log_prob_act_other)

        for agent in agents:
            agent.update_to_new_params()
    return results_one, results_two, reward_one_to_two, reward_two_to_one
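
Example 2 relies on a Trajectory buffer and on helper functions (action_sampling, give_incentivisation) that are not shown here. A minimal sketch of the buffer, covering only the calls made above (add, get_state, get_action) and assuming observations are flat float vectors; this is not the original implementation:

import torch

class Trajectory:
    """Per-agent rollout buffer (a sketch, not the original implementation)."""

    def __init__(self):
        self.obs, self.actions, self.actions_hot = [], [], []
        self.env_rewards, self.incentives_received = [], []

    def add(self, obs, action, action_hot, env_reward, incentive_received):
        self.obs.append(obs)
        self.actions.append(action)
        self.actions_hot.append(action_hot)
        self.env_rewards.append(env_reward)
        self.incentives_received.append(incentive_received)

    def get_state(self):
        # Batched observations for a single forward pass through the policy.
        return torch.tensor(self.obs, dtype=torch.float32)

    def get_action(self):
        return self.actions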
Example #3
def train(config):

    seed = config.main.seed
    np.random.seed(seed)
    random.seed(seed)
    tf.set_random_seed(seed)

    dir_name = config.main.dir_name
    exp_name = config.main.exp_name
    log_path = os.path.join('..', 'results', exp_name, dir_name)
    model_name = config.main.model_name
    save_period = config.main.save_period

    os.makedirs(log_path, exist_ok=True)

    # Keep a record of parameters used for this run
    with open(os.path.join(log_path, 'config.json'), 'w') as f:
        json.dump(config, f, indent=4, sort_keys=True)

    n_episodes = int(config.alg.n_episodes)
    n_eval = config.alg.n_eval
    period = config.alg.period

    epsilon = config.lio.epsilon_start
    epsilon_step = (epsilon - config.lio.epsilon_end) / config.lio.epsilon_div

    if config.env.name == 'er':
        env = room_symmetric.Env(config.env)
    elif config.env.name == 'ipd':
        env = ipd_wrapper.IPD(config.env)

    if config.lio.decentralized:
        from lio_decentralized import LIO
    elif config.lio.use_actor_critic:
        from lio_ac import LIO
    else:
        from lio_agent import LIO

    list_agents = []
    for agent_id in range(env.n_agents):
        if config.lio.decentralized:
            list_agent_id_opp = list(range(env.n_agents))
            del list_agent_id_opp[agent_id]
            list_agents.append(
                LIO(config.lio, env.l_obs, env.l_action, config.nn,
                    'agent_%d' % agent_id, config.env.r_multiplier,
                    env.n_agents, agent_id, list_agent_id_opp))
        else:
            list_agents.append(
                LIO(config.lio, env.l_obs, env.l_action, config.nn,
                    'agent_%d' % agent_id, config.env.r_multiplier,
                    env.n_agents, agent_id))

    for agent_id in range(env.n_agents):
        if config.lio.decentralized:
            list_agents[agent_id].create_opp_modeling_op()
        else:
            list_agents[agent_id].receive_list_of_agents(list_agents)
        list_agents[agent_id].create_policy_gradient_op()
        list_agents[agent_id].create_update_op()
        if config.lio.use_actor_critic:
            list_agents[agent_id].create_critic_train_op()

    for agent_id in range(env.n_agents):
        list_agents[agent_id].create_reward_train_op()

    # This handles the special case of two asymmetric agents,
    # one of which is the reward-giver and the other is the recipient
    if config.lio.asymmetric:
        assert config.env.n_agents == 2
        for agent_id in range(env.n_agents):
            list_agents[agent_id].set_can_give(
                agent_id != config.lio.idx_recipient)

    config_proto = tf.ConfigProto()
    if config.main.use_gpu:
        config_proto.device_count['GPU'] = 1
        config_proto.gpu_options.allow_growth = True
    else:
        config_proto.device_count['GPU'] = 0
    sess = tf.Session(config=config_proto)
    sess.run(tf.global_variables_initializer())

    if config.lio.use_actor_critic:
        for agent in list_agents:
            sess.run(agent.list_initialize_v_ops)

    list_agent_meas = []
    if config.env.name == 'er':
        list_suffix = [
            'reward_total', 'n_lever', 'n_door', 'received', 'given',
            'r-lever', 'r-start', 'r-door'
        ]
    elif config.env.name == 'ipd':
        list_suffix = ['given', 'received', 'reward_env', 'reward_total']
    for agent_id in range(1, env.n_agents + 1):
        for suffix in list_suffix:
            list_agent_meas.append('A%d_%s' % (agent_id, suffix))

    saver = tf.train.Saver(max_to_keep=config.main.max_to_keep)

    header = 'episode,step_train,step,'
    header += ','.join(list_agent_meas)
    if config.env.name == 'er':
        header += ',steps_per_eps\n'
    else:
        header += '\n'
    with open(os.path.join(log_path, 'log.csv'), 'w') as f:
        f.write(header)

    step = 0
    step_train = 0
    for idx_episode in range(1, n_episodes + 1):

        list_buffers = run_episode(sess,
                                   env,
                                   list_agents,
                                   epsilon,
                                   prime=False)
        step += len(list_buffers[0].obs)

        if config.lio.decentralized:
            for idx, agent in enumerate(list_agents):
                agent.train_opp_model(sess, list_buffers, epsilon)

        for idx, agent in enumerate(list_agents):
            agent.update(sess, list_buffers[idx], epsilon)

        list_buffers_new = run_episode(sess,
                                       env,
                                       list_agents,
                                       epsilon,
                                       prime=True)
        step += len(list_buffers_new[0].obs)

        for agent in list_agents:
            if agent.can_give:
                agent.train_reward(sess, list_buffers, list_buffers_new,
                                   epsilon)

        for idx, agent in enumerate(list_agents):
            if config.lio.decentralized:
                agent.train_opp_model(sess, list_buffers_new, epsilon)
            else:
                agent.update_main(sess)

        step_train += 1

        if idx_episode % period == 0:

            if config.env.name == 'er':
                (reward_total, n_move_lever, n_move_door, rewards_received,
                 rewards_given, steps_per_episode, r_lever, r_start,
                 r_door) = evaluate.test_room_symmetric(
                     n_eval, env, sess, list_agents)
                matrix_combined = np.stack([
                    reward_total, n_move_lever, n_move_door, rewards_received,
                    rewards_given, r_lever, r_start, r_door
                ])
            elif config.env.name == 'ipd':
                given, received, reward_env, reward_total = evaluate.test_ipd(
                    n_eval, env, sess, list_agents)
                matrix_combined = np.stack(
                    [given, received, reward_env, reward_total])

            s = '%d,%d,%d' % (idx_episode, step_train, step)
            for idx in range(env.n_agents):
                s += ','
                if config.env.name == 'er':
                    s += ('{:.3e},{:.3e},{:.3e},{:.3e},{:.3e},'
                          '{:.3e},{:.3e},{:.3e}').format(*matrix_combined[:,
                                                                          idx])
                elif config.env.name == 'ipd':
                    s += '{:.3e},{:.3e},{:.3e},{:.3e}'.format(
                        *matrix_combined[:, idx])
            if config.env.name == 'er':
                s += ',%.2f\n' % steps_per_episode
            else:
                s += '\n'
            with open(os.path.join(log_path, 'log.csv'), 'a') as f:
                f.write(s)

        if idx_episode % save_period == 0:
            saver.save(
                sess,
                os.path.join(log_path, '%s.%d' % (model_name, idx_episode)))

        if epsilon > config.lio.epsilon_end:
            epsilon -= epsilon_step

    saver.save(sess, os.path.join(log_path, model_name))
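
The exploration schedule above decays epsilon linearly from epsilon_start to epsilon_end over epsilon_div updates. Isolated, with illustrative values in place of the config fields, the schedule is just:

epsilon_start, epsilon_end, epsilon_div = 0.5, 0.05, 1000.0
epsilon = epsilon_start
epsilon_step = (epsilon_start - epsilon_end) / epsilon_div

for idx_episode in range(2000):
    # ... collect and train on an episode with the current epsilon ...
    if epsilon > epsilon_end:
        epsilon -= epsilon_step  # linear decay, stopping near epsilon_end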
Example #4
def train(config):

    # set seeds for the training
    # seed = config.main.seed
    seed = 12345
    np.random.seed(seed)
    random.seed(seed)
    torch.manual_seed(seed)

    n_episodes = int(config.alg.n_episodes)
    n_eval = config.alg.n_eval
    period = config.alg.period

    results_one = []
    results_two = []
    reward_one_to_two = []
    reward_two_to_one = []

    # initialize the environment
    env = room_symmetric.Env(config.env)

    agents = []
    for i in range(config.env.n_agents):
        agents.append(Actor(i, 7, config.env.n_agents))

    # epoch start
    for epoch in range(6000):
        trajs = [Trajectory() for _ in range(env.n_agents)]
        list_obs = env.reset()
        list_obs_next = None
        done = False
        result_one = 0
        result_two = 0

        while not done:
            list_act = []
            list_act_hot = []
            if list_obs_next is not None:
                list_obs = list_obs_next
            # set observations and decide actions
            for agent in agents:
                agent.set_obs(list_obs[agent.id])
                agent.action_sampling()
                list_act_hot.append(agent.get_action_hot())
                list_act.append(agent.get_action())

            list_rewards = []
            total_reward_given_to_each_agent = np.zeros(env.n_agents)

            # give rewards
            for agent in agents:
                reward = agent.give_reward(list_act_hot)
                reward[agent.id] = 0
                total_reward_given_to_each_agent += reward
                reward = np.delete(reward, agent.id)
                list_rewards.append(reward)

            # execute step
            list_obs_next, env_rewards, done = env.step(list_act, list_rewards)

            for agent in agents:
                reward_given = total_reward_given_to_each_agent[agent.id]
                trajs[agent.id].add(agent.get_obs(), agent.get_action(), agent.get_action_hot(), env_rewards[agent.id], reward_given)

            result_one += env_rewards[0]
            result_two += env_rewards[1]

            # if done:
                # results_one.append(result_one)
                # results_two.append(result_two)

        for agent in agents:
            agent.update_policy(trajs[agent.id])

        # Generate a new trajectory
        trajs_new = [Trajectory() for _ in range(env.n_agents)]
        list_obs = env.reset()
        list_obs_next = None
        done = False
        result_one_new = 0
        result_two_new = 0

        while not done:
            list_act = []
            list_act_hot = []
            if list_obs_next is not None:
                list_obs = list_obs_next
            # set observations and decide actions
            for agent in agents:
                agent.set_obs(list_obs[agent.id])
                agent.action_sampling()
                list_act_hot.append(agent.get_action_hot())
                list_act.append(agent.get_action())

            list_rewards = []
            total_reward_given_to_each_agent = np.zeros(env.n_agents)

            # give rewards
            for agent in agents:
                reward = agent.give_reward(list_act_hot)
                reward[agent.id] = 0
                total_reward_given_to_each_agent += reward
                reward = np.delete(reward, agent.id)
                list_rewards.append(reward)
                if agent.id == 0:
                    reward_one_to_two.append(reward)
                else:
                    reward_two_to_one.append(reward)


            # execute step
            list_obs_next, env_rewards, done = env.step(list_act, list_rewards)

            for agent in agents:
                reward_given = total_reward_given_to_each_agent[agent.id]
                trajs_new[agent.id].add(agent.get_obs(), agent.get_action(), agent.get_action_hot(), env_rewards[agent.id], reward_given)

            result_one_new += env_rewards[0]
            result_two_new += env_rewards[1]

            if done:
                results_one.append(result_one_new)
                results_two.append(result_two_new)

        for agent in agents:
            agent.update_rewards_giving(trajs, trajs_new)

    return results_one, results_two, reward_one_to_two, reward_two_to_one
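
The per-step incentive bookkeeping in Example 4 zeroes each giver's own slot, tallies what every agent receives, and strips the self-entry before passing the vectors to env.step. The same logic with a fixed, made-up reward matrix in place of the learned give_reward outputs:

import numpy as np

n_agents = 2
given = [np.array([0.7, 1.2]),   # what agent 0 proposes to give to agents [0, 1]
         np.array([0.4, 0.3])]   # what agent 1 proposes to give to agents [0, 1]

list_rewards = []
total_reward_given_to_each_agent = np.zeros(n_agents)
for agent_id in range(n_agents):
    reward = given[agent_id].copy()
    reward[agent_id] = 0                      # no self-incentivisation
    total_reward_given_to_each_agent += reward
    list_rewards.append(np.delete(reward, agent_id))  # drop own slot for env.step

print(list_rewards)                           # [array([1.2]), array([0.4])]
print(total_reward_given_to_each_agent)       # [0.4 1.2]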
Example #5
def train(config):

    # set seeds for training
    seed = config.main.seed
    np.random.seed(seed)
    random.seed(seed)
    tf.set_random_seed(seed)

    # names and output paths for this run
    dir_name = config.main.dir_name
    exp_name = config.main.exp_name
    log_path = os.path.join('..', 'results', exp_name, dir_name)
    model_name = config.main.model_name
    save_period = config.main.save_period

    # create folder for results
    os.makedirs(log_path, exist_ok=True)

    # record the parameters used for this run
    with open(os.path.join(log_path, 'config.json'), 'w') as f:
        json.dump(config, f, indent=4, sort_keys=True)

    # set hyper-parameters
    n_episodes = int(config.alg.n_episodes)
    n_eval = config.alg.n_eval
    period = config.alg.period

    # parameters for epsilon-greedy method
    epsilon = config.lio.epsilon_start
    epsilon_step = (epsilon - config.lio.epsilon_end) / config.lio.epsilon_div

    # make env
    env = room_symmetric.Env(config.env)

    from lio_agent import LIO

    # init lio agents
    list_agents = []
    for agent_id in range(env.n_agents):
        list_agents.append(
            LIO(config.lio, env.l_obs, env.l_action, config.nn,
                'agent_%d' % agent_id, config.env.r_multiplier, env.n_agents,
                agent_id))

    # init training optimizers
    for agent_id in range(env.n_agents):
        list_agents[agent_id].receive_list_of_agents(list_agents)
        list_agents[agent_id].create_policy_gradient_op()
        list_agents[agent_id].create_update_op()

    for agent_id in range(env.n_agents):
        list_agents[agent_id].create_reward_train_op()

    # This handles the special case of two asymmetric agents,
    # one of which is the reward-giver and the other is the recipient
    if config.lio.asymmetric:
        assert config.env.n_agents == 2
        for agent_id in range(env.n_agents):
            list_agents[agent_id].set_can_give(
                agent_id != config.lio.idx_recipient)

    config_proto = tf.ConfigProto()
    if config.main.use_gpu:
        config_proto.device_count['GPU'] = 1
        config_proto.gpu_options.allow_growth = True
    else:
        config_proto.device_count['GPU'] = 0
    sess = tf.Session(config=config_proto)
    sess.run(tf.global_variables_initializer())

    # measurement column names for the log file
    list_agent_meas = []
    list_suffix = [
        'reward_total', 'n_lever', 'n_door', 'received', 'given', 'r-lever',
        'r-start', 'r-door'
    ]
    for agent_id in range(1, env.n_agents + 1):
        for suffix in list_suffix:
            list_agent_meas.append('A%d_%s' % (agent_id, suffix))

    # saver for periodic model checkpoints
    saver = tf.train.Saver(max_to_keep=config.main.max_to_keep)

    header = 'episode,step_train,step,'
    header += ','.join(list_agent_meas)
    header += ',steps_per_eps\n'

    with open(os.path.join(log_path, 'log.csv'), 'w') as f:
        f.write(header)

    # episode start
    step = 0
    step_train = 0
    for idx_episode in range(1, n_episodes + 1):

        # generate a trajectory
        list_buffers = run_episode(sess,
                                   env,
                                   list_agents,
                                   epsilon,
                                   prime=False)
        step += len(list_buffers[0].obs)

        for idx, agent in enumerate(list_agents):
            agent.update(sess, list_buffers[idx], epsilon)

        # generate a new trajectory using the updated policy parameters
        list_buffers_new = run_episode(sess,
                                       env,
                                       list_agents,
                                       epsilon,
                                       prime=True)
        step += len(list_buffers_new[0].obs)

        # train incentive function
        for agent in list_agents:
            if agent.can_give:
                agent.train_reward(sess, list_buffers, list_buffers_new,
                                   epsilon)

        for idx, agent in enumerate(list_agents):
            agent.update_main(sess)

        step_train += 1

        # append evaluation results to the log file
        if idx_episode % period == 0:
            (reward_total, n_move_lever, n_move_door, rewards_received,
             rewards_given, steps_per_episode, r_lever, r_start,
             r_door) = evaluate.test_room_symmetric(n_eval, env, sess,
                                                    list_agents)
            matrix_combined = np.stack([
                reward_total, n_move_lever, n_move_door, rewards_received,
                rewards_given, r_lever, r_start, r_door
            ])

            s = '%d,%d,%d' % (idx_episode, step_train, step)
            for idx in range(env.n_agents):
                s += ','
                s += ('{:.3e},{:.3e},{:.3e},{:.3e},{:.3e},'
                      '{:.3e},{:.3e},{:.3e}').format(*matrix_combined[:, idx])
            s += ',%.2f\n' % steps_per_episode
            with open(os.path.join(log_path, 'log.csv'), 'a') as f:
                f.write(s)

        if idx_episode % save_period == 0:
            saver.save(
                sess,
                os.path.join(log_path, '%s.%d' % (model_name, idx_episode)))

        if epsilon > config.lio.epsilon_end:
            epsilon -= epsilon_step

    saver.save(sess, os.path.join(log_path, model_name))
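
Each evaluation period appends one comma-separated row to log.csv under the header built above. A hypothetical post-processing snippet (the path below is a placeholder) that reads the file back and prints agent 1's total reward per logged episode:

import csv
import os

log_path = os.path.join('..', 'results', 'my_exp', 'my_run')  # placeholder path

with open(os.path.join(log_path, 'log.csv')) as f:
    for row in csv.DictReader(f):
        print(row['episode'], row['A1_reward_total'])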
Example #6
def train(config):

    # set seeds for the training
    # seed = config.main.seed
    seed = 1234
    np.random.seed(seed)
    random.seed(seed)
    torch.manual_seed(seed)

    n_episodes = int(config.alg.n_episodes)
    n_eval = config.alg.n_eval
    period = config.alg.period

    results_one = []
    results_two = []
    reward_one_to_two = []
    reward_two_to_one = []

    # initialize the environment
    env = room_symmetric.Env(config.env)

    agents = []
    for i in range(config.env.n_agents):
        agents.append(Actor(i, 7, config.env.n_agents))

    # epoch start
    for epoch in range(5000):
        trajs = [Trajectory() for _ in range(env.n_agents)]
        list_obs = env.reset()
        list_obs_next = None
        done = False
        result_one = 0
        result_two = 0

        while not done:
            list_act = []
            list_act_hot = []
            if list_obs_next is not None:
                list_obs = list_obs_next
            # set observations and decide actions
            for agent in agents:
                agent.set_obs(list_obs[agent.id])
                agent.action_sampling()
                list_act_hot.append(agent.get_action_hot())
                list_act.append(agent.get_action())

            list_rewards = []
            total_reward_given_to_each_agent = torch.zeros(env.n_agents)
            reward = [None for _ in range(env.n_agents)]

            # give rewards
            for agent in agents:
                reward[agent.id] = agent.give_reward(list_act_hot)
                for idx in range(env.n_agents):
                    if idx != agent.id:
                        total_reward_given_to_each_agent[idx] += reward[
                            agent.id][idx]  # incentive received by each agent
                reward_sum = (reward[agent.id].sum() -
                              reward[agent.id][agent.id]
                              ).detach().numpy()  # total incentive this agent gives to others at this step
                list_rewards.append(reward_sum)

            # execute step
            list_obs_next, env_rewards, done = env.step(list_act, list_rewards)

            for agent in agents:
                reward_given = total_reward_given_to_each_agent[agent.id]
                trajs[agent.id].add(agent.get_obs(), agent.get_action(),
                                    agent.get_action_hot(),
                                    env_rewards[agent.id], reward_given)

            result_one += env_rewards[0]
            result_two += env_rewards[1]

        for agent in agents:
            agent.update_policy(trajs[agent.id])

        # Generate a new trajectory
        trajs_new = [Trajectory() for _ in range(env.n_agents)]
        list_obs = env.reset()
        list_obs_next = None
        done = False
        result_one_new = 0
        result_two_new = 0

        while not done:
            list_act = []
            list_act_hot = []
            if list_obs_next is not None:
                list_obs = list_obs_next
            # set observations and decide actions
            for agent in agents:
                agent.set_obs(list_obs[agent.id])
                agent.action_sampling(agent.new_params)
                list_act_hot.append(agent.get_action_hot())
                list_act.append(agent.get_action())

            list_rewards = []
            total_reward_given_to_each_agent = torch.zeros(env.n_agents)
            reward_new = [None for _ in range(env.n_agents)]

            # give rewards
            for agent in agents:
                reward_new[agent.id] = agent.give_reward(list_act_hot)
                reward_sum = torch.zeros(1)

                for idx in range(env.n_agents):
                    if idx != agent.id:
                        total_reward_given_to_each_agent[idx] += reward_new[
                            agent.id][idx]
                        reward_sum += reward_new[agent.id][
                            idx]  # running total of reward this agent gives to others
                reward_sum = (reward_new[agent.id].sum() -
                              reward_new[agent.id][agent.id]).detach().numpy()
                list_rewards.append(reward_sum)

                if agent.id == 0:
                    reward_one_to_two.append(reward_sum)
                else:
                    reward_two_to_one.append(reward_sum)

            # execute step
            list_obs_next, env_rewards, done = env.step(list_act, list_rewards)

            for agent in agents:
                reward_given = total_reward_given_to_each_agent[agent.id]
                trajs_new[agent.id].add(agent.get_obs(), agent.get_action(),
                                        agent.get_action_hot(),
                                        env_rewards[agent.id], reward_given)

            result_one_new += env_rewards[0]
            result_two_new += env_rewards[1]

            if done:
                results_one.append(result_one_new)
                results_two.append(result_two_new)

        # compute new log prob act
        log_prob_act_other = [[] for _ in range(config.env.n_agents)]
        for agent in agents:
            states_new = [trajectory.get_state() for trajectory in trajs_new]
            actions_new = [trajectory.get_action() for trajectory in trajs_new]
            logits, _ = agent.policy_net(states_new[agent.id],
                                         agent.new_params)
            # grad_graph(logits, 'logits')
            log_prob = F.log_softmax(logits, dim=-1)
            log_prob_act = torch.stack([
                log_prob[i][actions_new[agent.id][i]]
                for i in range(len(actions_new[agent.id]))
            ],
                                       dim=0)
            log_prob_act_other[agent.id] = log_prob_act

        for agent in agents:
            agent.update_rewards_giving(trajs, trajs_new, log_prob_act_other)

        for agent in agents:
            agent.update_to_new_params()

    return results_one, results_two, reward_one_to_two, reward_two_to_one
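
The per-action log-probability extraction at the end of Example 6 (indexing log_prob and stacking) can equivalently be written with torch.gather; a small self-contained check with made-up logits and actions:

import torch
import torch.nn.functional as F

logits = torch.randn(5, 7)               # 5 timesteps, 7 discrete actions
actions = torch.tensor([0, 3, 6, 2, 1])  # sampled action index per timestep

log_prob = F.log_softmax(logits, dim=-1)

# Indexing + stack, as in the training loop above.
log_prob_act = torch.stack(
    [log_prob[i][actions[i]] for i in range(len(actions))], dim=0)

# Same values via gather.
log_prob_act_gather = log_prob.gather(1, actions.unsqueeze(1)).squeeze(1)

assert torch.allclose(log_prob_act, log_prob_act_gather)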
Example #7
def train(config):

    # set random seed
    seed = 1234
    np.random.seed(seed)
    random.seed(seed)
    torch.manual_seed(seed)

    num_epoch = 5000
    lr_a = 0.01

    epsilon = 0.5
    epsilon_end = 0.1
    epsilon_div = 1e3
    epsilon_step = ((epsilon - epsilon_end) / epsilon_div)

    results_one = []
    results_two = []
    reward_one_to_two = []
    reward_two_to_one = []

    # init game env
    env = room_symmetric.Env(config.env)

    agents = []
    for i in range(env.n_agents):
        agents.append(Actor(i, 7, env.n_agents, lr=lr_a))

    # epoch start
    for epoch in range(num_epoch):
        if (epoch + 1) % 500 == 0:
            print("Epoch: ", epoch + 1, "/", num_epoch)
        """The first trajectory generation"""
        trajs = [Trajectory() for _ in range(env.n_agents)]
        list_obs = env.reset()
        list_obs_next = None
        done = False

        while not done:
            if list_obs_next is not None:
                list_obs = list_obs_next

            # decide actions from observations
            list_act, list_act_hot = action_sampling(agents, list_obs, epsilon)

            # give incentivisation
            inctv_to, inctv_from = give_incentivisation(
                agents, list_act_hot, config)
            # execute step
            list_obs_next, env_rewards, done = env.step(list_act, inctv_to)

            # save trajectory
            for agent in agents:
                trajs[agent.id].add(agent.get_obs(), agent.get_action(),
                                    agent.get_action_hot(),
                                    env_rewards[agent.id],
                                    inctv_from[agent.id])

        for agent in agents:
            agent.update_policy(trajs, lr=lr_a)
        """The second trajectory generation"""
        # Generate a new trajectory
        trajs_new = [Trajectory() for _ in range(env.n_agents)]
        list_obs = env.reset()
        list_obs_next = None
        done = False
        result_one_new = 0
        result_two_new = 0

        while not done:
            if list_obs_next is not None:
                list_obs = list_obs_next
            # decide actions from observations
            list_act, list_act_hot = action_sampling(agents, list_obs, epsilon)

            # give incentivisation
            inctv_to, new_inctv_from_others = give_incentivisation(
                agents, list_act_hot, config)

            reward_one_to_two.append(inctv_to[0])
            reward_two_to_one.append(inctv_to[1])

            # execute step
            list_obs_next, env_rewards, done = env.step(list_act, inctv_to)

            for agent in agents:
                trajs_new[agent.id].add(agent.get_obs(), agent.get_action(),
                                        agent.get_action_hot(),
                                        env_rewards[agent.id],
                                        new_inctv_from_others[agent.id])

            result_one_new += env_rewards[0]
            result_two_new += env_rewards[1]

            if done:
                results_one.append(result_one_new)
                results_two.append(result_two_new)

        # compute new log prob act
        log_prob_act_other = compute_log_prob_act_other(
            agents, trajs_new, config)

        for agent in agents:
            agent.update_rewards_giving(trajs, trajs_new, log_prob_act_other)

        for agent in agents:
            agent.update_to_new_params()

        if epsilon > epsilon_end:
            epsilon -= epsilon_step

    return results_one, results_two, reward_one_to_two, reward_two_to_one
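
Example 7 factors the new-trajectory log-probability computation into a compute_log_prob_act_other helper. A sketch consistent with the inline version in Examples 2 and 6 (the original helper may differ in its exact signature):

import torch
import torch.nn.functional as F

def compute_log_prob_act_other(agents, trajs_new, config):
    # Log-probability of each agent's sampled actions under its updated policy.
    log_prob_act_other = [None for _ in range(config.env.n_agents)]
    for agent in agents:
        states_new = [trajectory.get_state() for trajectory in trajs_new]
        actions_new = [trajectory.get_action() for trajectory in trajs_new]
        logits, _ = agent.policy_net(states_new[agent.id], agent.new_params)
        log_prob = F.log_softmax(logits, dim=-1)
        log_prob_act = torch.stack(
            [log_prob[i][actions_new[agent.id][i]]
             for i in range(len(actions_new[agent.id]))],
            dim=0)
        log_prob_act_other[agent.id] = log_prob_act
    return log_prob_act_other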