# Assumed imports for these snippets: `U` matches the tf_util session helper
# used by the MADDPG reference implementation; make_env, get_trainers,
# get_group_trainers, parse_args and get_action are helpers defined elsewhere
# in the surrounding training script.
import pickle
import time

import numpy as np
import pandas as pd
import tensorflow as tf

import maddpg.common.tf_util as U


def train(arglist):
    with U.single_threaded_session():
        # Create environment
        env = make_env(arglist.scenario, arglist, arglist.benchmark)
        # Create agent trainers
        obs_shape_n = [env.observation_space[i].shape for i in range(env.n)]
        attention_shape_n = [
            env.group_attention_input[i].shape for i in range(env.n)
        ]

        group_shape_n = []
        for i in range(env.n):
            current_shape_n = [
                env.group_space_input[i][j].shape for j in range(0, 5)
            ]
            group_shape_n.append(current_shape_n)

        num_adversaries = min(env.n, arglist.num_adversaries)
        trainers = get_trainers(env, num_adversaries, obs_shape_n, arglist)
        group_trainers, attention_trainers = get_group_trainers(
            env, group_shape_n, attention_shape_n, arglist)
        print('Using good policy {} and adv policy {}'.format(
            arglist.good_policy, arglist.adv_policy))

        # Initialize
        U.initialize()

        for agent in trainers:
            agent.saver = tf.train.Saver()
        for agent in group_trainers:
            agent.saver = tf.train.Saver()
        for agent in attention_trainers:
            agent.saver = tf.train.Saver()

        # Load previous results, if necessary
        if arglist.load_dir == "":
            arglist.load_dir = arglist.save_dir
        if arglist.display or arglist.restore or arglist.benchmark:
            print('Loading previous state...')
            U.load_state(arglist.load_dir)

        episode_rewards = [0.0]  # sum of rewards for all agents
        agent_rewards = [[0.0]
                         for _ in range(env.n)]  # individual agent reward
        final_ep_rewards = []  # sum of rewards for training curve
        final_ep_ag_rewards = []  # agent rewards for training curve
        final_ep_ag_rewards_0 = []
        final_ep_ag_rewards_1 = []
        final_ep_ag_rewards_2 = []
        agent_info = [[[]]]  # placeholder for benchmarking info
        saver = tf.train.Saver()
        obs_n = env.reset()
        episode_step = 0
        train_step = 0
        t_start = time.time()

        old_attention = []
        old_group = []

        print('Starting iterations...')
        while True:
            # get action
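            # Each agent's flat observation is sliced into five hand-crafted
            # feature groups of three values each (the index layout is
            # scenario-specific and assumes an observation with at least 13
            # entries); every group feeds its own group-level trainer.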
            group_obs = []
            for obs in obs_n:
                group1 = []
                group2 = []
                group3 = []
                group4 = []
                group5 = []
                group1.append([obs[8], obs[12], 0])
                group2.append([obs[10], obs[11], 0])
                group3.append([obs[0], obs[1], obs[9]])
                group4.append([obs[2], obs[4], obs[6]])
                group5.append([obs[3], obs[5], obs[7]])

                group_obs.append(np.squeeze(np.asarray(group1)))
                group_obs.append(np.squeeze(np.asarray(group2)))
                group_obs.append(np.squeeze(np.asarray(group3)))
                group_obs.append(np.squeeze(np.asarray(group4)))
                group_obs.append(np.squeeze(np.asarray(group5)))

            group_output = [
                agent.action(obs)
                for agent, obs in zip(group_trainers, group_obs)
            ]
            g1 = []
            g2 = []
            attention_input = []
            for i in range(0, len(group_output)):
                if i < 5:
                    g1.extend(group_output[i])
                elif i < 10:
                    g2.extend(group_output[i])

            attention_input.append(np.squeeze(np.asarray(g1)))
            attention_input.append(np.squeeze(np.asarray(g2)))
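            # The first five group outputs are concatenated for the first
            # agent and the next five for the second (the code assumes exactly
            # two agents); each concatenation is the input to that agent's
            # attention network.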

            attention_output = [
                agent.action(obs)
                for agent, obs in zip(attention_trainers, attention_input)
            ]

            if train_step == 0:
                old_group = group_obs
                old_attention = attention_input

            argmax = [np.argmax(attention) for attention in attention_output]

            attention_comm = []
            attention_comm.append(group_output[argmax[0]])
            attention_comm.append(group_output[argmax[1] + 5])
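            # Each attention head picks, via argmax, one of its agent's five
            # group outputs (indices 0-4 for the first agent, 5-9 for the
            # second); the selected output becomes that agent's communication
            # message.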

            for i, agent in enumerate(env.agents):
                agent.state.c = attention_comm[i]

            for i in range(0, len(obs_n)):
                obs_n[i] = obs_n[i][:11]
                for j in range(0, len(attention_comm)):
                    if j != i:
                        obs_n[i] = np.append(obs_n[i], attention_comm[j])

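            # Each agent acts on its truncated observation (first 11 entries
            # plus the other agent's communication, built above); the physical
            # action and the agent's own communication are then concatenated
            # into the full environment action.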
            physical_action_n = [
                agent.action(obs) for agent, obs in zip(trainers, obs_n)
            ]
            action_n = []
            for phy, com in zip(physical_action_n, attention_comm):
                action_n.append(np.concatenate((phy, com), axis=0))
            # environment step
            new_obs_n, rew_n, done_n, info_n = env.step(action_n, argmax)
            episode_step += 1
            done = all(done_n)
            terminal = (episode_step >= arglist.max_episode_len)
            # collect experience
            for i, agent in enumerate(trainers):
                agent.experience(obs_n[i], physical_action_n[i], rew_n[i],
                                 new_obs_n[i], done_n[i], terminal)
            for i, agent in enumerate(attention_trainers):
                agent.experience(attention_input[i], attention_output[i],
                                 rew_n[i], old_attention[i], done_n[i],
                                 terminal)
            for i, agent in enumerate(group_trainers):
                if i < 5:
                    agent.experience(group_obs[i], group_output[i], rew_n[0],
                                     old_group[i], done_n[0], terminal)
                elif i < 10:
                    agent.experience(group_obs[i], group_output[i], rew_n[1],
                                     old_group[i], done_n[1], terminal)

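            # For the attention and group trainers, the transition stored above
            # uses the previous step's inputs (old_attention / old_group) as
            # the "next" observation; those buffers are refreshed below.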
            old_attention = attention_input
            old_group = group_obs
            obs_n = new_obs_n

            for i, rew in enumerate(rew_n):
                episode_rewards[-1] += rew
                agent_rewards[i][-1] += rew

            if done or terminal:
                obs_n = env.reset()
                episode_step = 0
                episode_rewards.append(0)
                for a in agent_rewards:
                    a.append(0)
                agent_info.append([[]])

            # increment global step counter
            train_step += 1

            # for benchmarking learned policies
            if arglist.benchmark:
                for i, info in enumerate(info_n):
                    agent_info[-1][i].append(info_n['n'])
                if train_step > arglist.benchmark_iters and (done or terminal):
                    file_name = arglist.benchmark_dir + arglist.exp_name + '.pkl'
                    print('Finished benchmarking, now saving...')
                    with open(file_name, 'wb') as fp:
                        pickle.dump(agent_info[:-1], fp)
                    break
                continue

            # for displaying learned policies
            if arglist.display:
                time.sleep(0.1)
                env.render()
                continue

            # update all trainers, if not in display or benchmark mode
            loss = None
            for agent in trainers:
                agent.preupdate()
            for agent in attention_trainers:
                agent.preupdate()
            for agent in group_trainers:
                agent.preupdate()
            for agent in trainers:
                loss = agent.update(trainers, train_step)
            for agent in attention_trainers:
                loss = agent.update(attention_trainers, train_step)
            for agent in group_trainers:
                loss = agent.update(group_trainers, train_step)

            # save model, display training output
            if terminal and (len(episode_rewards) % arglist.save_rate == 0):
                U.save_state(arglist.save_dir, saver=saver)
                # print statement depends on whether or not there are adversaries
                if num_adversaries == 0:
                    print(
                        "steps: {}, episodes: {}, mean episode reward: {}, time: {}"
                        .format(train_step, len(episode_rewards),
                                np.mean(episode_rewards[-arglist.save_rate:]),
                                round(time.time() - t_start, 3)))
                else:
                    print(
                        "steps: {}, episodes: {}, mean episode reward: {}, agent episode reward: {}, time: {}"
                        .format(train_step, len(episode_rewards),
                                np.mean(episode_rewards[-arglist.save_rate:]),
                                [
                                    np.mean(rew[-arglist.save_rate:])
                                    for rew in agent_rewards
                                ], round(time.time() - t_start, 3)))
                t_start = time.time()
                # Keep track of final episode reward
                final_ep_rewards.append(
                    np.mean(episode_rewards[-arglist.save_rate:]))
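                # Per-agent reward curves are additionally bucketed by agent
                # index modulo 3 into three separate series.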
                for i, rew in enumerate(agent_rewards):
                    if i % 3 == 0:
                        final_ep_ag_rewards_0.append(
                            np.mean(rew[-arglist.save_rate:]))
                    if i % 3 == 1:
                        final_ep_ag_rewards_1.append(
                            np.mean(rew[-arglist.save_rate:]))
                    if i % 3 == 2:
                        final_ep_ag_rewards_2.append(
                            np.mean(rew[-arglist.save_rate:]))
                    final_ep_ag_rewards.append(
                        np.mean(rew[-arglist.save_rate:]))

            # saves final episode reward for plotting training curve later
            if len(episode_rewards) > arglist.num_episodes:

                # for i, agent in enumerate(comm_trainers):
                #     model_path = arglist.plots_dir + arglist.exp_name + "_" + str(arglist.num_episodes) + '_agent_' + str(i) + 'model.ckpt'
                #     saver.save(U.get_session(), model_path)

                rew_file_name = arglist.plots_dir + arglist.exp_name + "_" + str(
                    arglist.num_episodes) + '_rewards.csv'
                pd.DataFrame(final_ep_rewards).to_csv(rew_file_name, index=False)

                agrew_file_name = arglist.plots_dir + arglist.exp_name + "_" + str(
                    arglist.num_episodes) + '_agrewards.csv'
                pd.DataFrame(final_ep_ag_rewards).to_csv(agrew_file_name, index=False)

                #  entireObs = []
                #  for i, agent in enumerate(comm_trainers):
                #      if i == 1:
                #          entireObs.extend(agent.collectEntrieObs())

                #  agrew_file_name = arglist.plots_dir + arglist.exp_name + "_" + str(arglist.num_episodes) + '_replaybufferObs.csv'
                #  csv3 = pd.DataFrame(entireObs).to_csv(agrew_file_name, index=False)

                print('...Finished total of {} episodes.'.format(
                    len(episode_rewards)))
                break
    return final_ep_rewards, final_ep_ag_rewards, final_ep_ag_rewards_0, final_ep_ag_rewards_1, final_ep_ag_rewards_2
Example #2
def train(arglist):
    with U.single_threaded_session():
        # Create environment
        env = make_env(arglist.scenario, arglist, arglist.benchmark)
        # Create agent trainers
        obs_shape_n = [env.observation_space[i].shape for i in range(env.n)]
        num_adversaries = min(env.n, arglist.num_adversaries)
        trainers = get_trainers(env, num_adversaries, obs_shape_n, arglist)
        print('Using good policy {} and adv policy {}'.format(arglist.good_policy, arglist.adv_policy))

        # Initialize
        U.initialize()

        # Load previous results, if necessary
        if arglist.load_dir == "":
            arglist.load_dir = arglist.save_dir
        if arglist.display or arglist.restore or arglist.benchmark:
            print('Loading previous state...')
            U.load_state(arglist.load_dir)

        episode_rewards = [0.0]  # sum of rewards for all agents
        agent_rewards = [[0.0] for _ in range(env.n)]  # individual agent reward
        final_ep_rewards = []  # sum of rewards for training curve
        final_ep_ag_rewards = []  # agent rewards for training curve
        agent_info = [[[]]]  # placeholder for benchmarking info
        saver = tf.train.Saver()
        obs_n = env.reset()
        episode_step = 0
        train_step = 0
        t_start = time.time()

        print('Starting iterations...')
        while True:
            # get action
            action_n = [agent.action(obs) for agent, obs in zip(trainers,obs_n)]
            # environment step
            new_obs_n, rew_n, done_n, info_n = env.step(action_n)
            episode_step += 1
            done = all(done_n)
            terminal = (episode_step >= arglist.max_episode_len)
            # collect experience
            for i, agent in enumerate(trainers):
                agent.experience(obs_n[i], action_n[i], rew_n[i], new_obs_n[i], done_n[i], terminal)
            obs_n = new_obs_n

            for i, rew in enumerate(rew_n):
                episode_rewards[-1] += rew
                agent_rewards[i][-1] += rew

            if done or terminal:
                obs_n = env.reset()
                episode_step = 0
                episode_rewards.append(0)
                for a in agent_rewards:
                    a.append(0)
                agent_info.append([[]])

            # increment global step counter
            train_step += 1

            # for benchmarking learned policies
            if arglist.benchmark:
                for i, info in enumerate(info_n):
                    agent_info[-1][i].append(info_n['n'])
                if train_step > arglist.benchmark_iters and (done or terminal):
                    file_name = arglist.benchmark_dir + arglist.exp_name + '.pkl'
                    print('Finished benchmarking, now saving...')
                    with open(file_name, 'wb') as fp:
                        pickle.dump(agent_info[:-1], fp)
                    break
                continue

            # for displaying learned policies
            if arglist.display:
                time.sleep(0.1)
                env.render()
                continue

            # update all trainers, if not in display or benchmark mode
            loss = None
            for agent in trainers:
                agent.preupdate()
            for agent in trainers:
                loss = agent.update(trainers, train_step)
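            # update() is invoked every environment step; in the reference
            # MADDPG trainer it only performs a gradient step once enough
            # experience has accumulated and at a fixed step interval.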

            # save model, display training output
            if terminal and (len(episode_rewards) % arglist.save_rate == 0):
                U.save_state(arglist.save_dir, saver=saver)
                # print statement depends on whether or not there are adversaries
                if num_adversaries == 0:
                    print("steps: {}, episodes: {}, mean episode reward: {}, time: {}".format(
                        train_step, len(episode_rewards), np.mean(episode_rewards[-arglist.save_rate:]), round(time.time()-t_start, 3)))
                else:
                    print("steps: {}, episodes: {}, mean episode reward: {}, agent episode reward: {}, time: {}".format(
                        train_step, len(episode_rewards), np.mean(episode_rewards[-arglist.save_rate:]),
                        [np.mean(rew[-arglist.save_rate:]) for rew in agent_rewards], round(time.time()-t_start, 3)))
                t_start = time.time()
                # Keep track of final episode reward
                final_ep_rewards.append(np.mean(episode_rewards[-arglist.save_rate:]))
                for rew in agent_rewards:
                    final_ep_ag_rewards.append(np.mean(rew[-arglist.save_rate:]))

            # saves final episode reward for plotting training curve later
            if len(episode_rewards) > arglist.num_episodes:
#                rew_file_name = arglist.plots_dir + arglist.exp_name + '_rewards.pkl'
#                with open(rew_file_name, 'wb') as fp:
#                    pickle.dump(final_ep_rewards, fp)
#                agrew_file_name = arglist.plots_dir + arglist.exp_name + '_agrewards.pkl'
#                with open(agrew_file_name, 'wb') as fp:
#                    pickle.dump(final_ep_ag_rewards, fp)
                print('...Finished total of {} episodes.'.format(len(episode_rewards)))
                break
Example #3
def run_loop(agents, env, max_frames=0):
    """A run loop to have agents and an environment interact."""
    total_frames = 0
    start_time = time.time()
    arglist = parse_args()

    action_spec = env.action_spec()
    observation_spec = env.observation_spec()
    for agent in agents:
        agent.setup(observation_spec, action_spec)

    try:
        with U.single_threaded_session():
            timesteps = env.reset()
            for a in agents:
                a.reset()
            for a, timestep in zip(agents, timesteps):
                a.selected_units(timestep)
                obs_shape_n, timestep = a.build_group(timestep, env)
                action_space = [i for i in range(3)]
                action_space_n = []
                agent_rewards = []
                for i in range(a.num_units):
                    agent_rewards.append([0.0])
                    action_space_n.append(action_space)
                trainers = get_trainers(action_space_n, a.num_units,
                                        obs_shape_n, arglist)
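                # One trainer per controllable unit, each sharing the same
                # 3-action discrete action space built above.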

                # Initialize
                U.initialize()

                # Load previous results, if necessary
                if arglist.load_dir == "":
                    arglist.load_dir = arglist.save_dir
                if arglist.display or not arglist.restore or arglist.benchmark:
                    print('Loading previous state...')
                    U.load_state(arglist.load_dir)
                final_ep_rewards = []  # sum of rewards for training curve
                final_ep_ag_rewards = []  # agent rewards for training curve
                saver = tf.train.Saver()
                loss_n = []
                train_step = 0
                obs_n, timestep = a.get_obs(timestep, env)
                t_start = time.time()
            print('Starting iterations...')
            while True:
                win_pro = timestep.win_pro
                episode_rewards = timestep.episode_rewards
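                # At the start of each outer iteration, dump the running
                # win-probability (win_pro) and training-loss curves to CSV
                # once they contain more than one entry.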
                if len(win_pro) > 1:
                    data = np.array(win_pro)
                    np.savetxt(arglist.exp_name + '_win_pro.csv',
                               data,
                               delimiter=',')
                if len(loss_n) > 1:
                    data = np.array(loss_n)
                    np.savetxt(arglist.exp_name + '_loss.csv',
                               data,
                               delimiter=',')
                while True:
                    total_frames += 1
                    if isinstance(obs_n, list):
                        obs_n = np.array(obs_n)
                    action_n = [
                        trainer.action(obs)
                        for trainer, obs in zip(trainers, obs_n)
                    ]
                    rew_n = []
                    for i, action in enumerate(action_n):
                        if not timestep:
                            break
                        for agent in agents:
                            if agent.group[i] == True:
                                timestep = agent.select_unit(i, timestep, env)
                                if not timestep:
                                    break
                            timestep = get_action(action, timestep, env)
                            if not timestep:
                                break
                            new_obs_n, timestep = agent.get_obs(timestep, env)
                            rew_n.append(timestep.reward)
                        if max_frames and total_frames >= max_frames:
                            return

                    if not timestep:
                        break

                    if len(new_obs_n) != 5:
                        for i in range(len(new_obs_n), 5):
                            new_obs_n.append([0] * 20)
                    if len(rew_n) != 5:
                        for i in range(len(rew_n), 5):
                            rew_n.append(0)
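                    # Pad observations (20-dim zero vectors) and rewards up to
                    # a fixed squad size of 5 so the per-unit trainer lists
                    # keep a consistent length.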
                    for i, agent in enumerate(trainers):
                        agent.experience(obs_n[i], action_n[i], rew_n[i],
                                         new_obs_n[i])

                    obs_n = new_obs_n
                    for i, rew in enumerate(rew_n):
                        agent_rewards[i][-1] += rew

                    if not arglist.display:

                        train_step += 1
                        # update all trainers, if not in display or benchmark mode
                        loss = None
                        for agent in trainers:
                            agent.preupdate()
                        for agent in trainers:
                            loss = agent.update(trainers, train_step)
                        if isinstance(loss, list):
                            loss_n.append(loss)
                            print('loss:', loss)

                    # save model, display training output
                    if (len(episode_rewards) % arglist.save_rate == 0):
                        U.save_state(arglist.save_dir, saver=saver)
                        # print statement depends on whether or not there are adversaries
                        print(
                            "steps: {}, episodes: {}, mean episode reward: {}, agent episode reward: {}, time: {}"
                            .format(
                                train_step, len(episode_rewards),
                                np.mean(episode_rewards[-arglist.save_rate:]),
                                [
                                    np.mean(rew[-arglist.save_rate:])
                                    for rew in agent_rewards
                                ], round(time.time() - t_start, 3)))
                        t_start = time.time()
                        # Keep track of final episode reward
                        final_ep_rewards.append(
                            np.mean(episode_rewards[-arglist.save_rate:]))
                        for rew in agent_rewards:
                            final_ep_ag_rewards.append(
                                np.mean(rew[-arglist.save_rate:]))
                    # saves final episode reward for plotting training curve later
                    if len(episode_rewards) > arglist.num_episodes:
                        rew_file_name = arglist.plots_dir + arglist.exp_name + '_rewards.pkl'
                        with open(rew_file_name, 'wb') as fp:
                            pickle.dump(final_ep_rewards, fp)
                        agrew_file_name = arglist.plots_dir + arglist.exp_name + '_agrewards.pkl'
                        with open(agrew_file_name, 'wb') as fp:
                            pickle.dump(final_ep_ag_rewards, fp)
                        print('...Finished total of {} episodes.'.format(
                            len(episode_rewards)))
                        break
                timesteps = env.reset()
                for a in agents:
                    a.reset()
                for a, timestep in zip(agents, timesteps):
                    a.selected_units(timestep)
                    obs_shape_n, timestep = a.build_group(timestep, env)
    except KeyboardInterrupt:
        pass
    finally:
        elapsed_time = time.time() - start_time
        print("Took %.3f seconds for %s steps: %.3f fps" %
              (elapsed_time, total_frames, total_frames / elapsed_time))