Example #1
def train(arglist):
    with U.single_threaded_session():
        # Create environment
        env = make_env(arglist.scenario, arglist, arglist.benchmark)
        # Create agent trainers
        obs_shape_n = [env.observation_space[i].shape for i in range(env.n)]
        num_adversaries = min(env.n, arglist.num_adversaries)
        trainers = get_trainers(env, num_adversaries, obs_shape_n, arglist)
        print('Using good policy {} and adv policy {}'.format(
            arglist.good_policy, arglist.adv_policy))

        # Initialize
        U.initialize()

        # Load previous results, if necessary
        if arglist.load_dir == "":
            arglist.load_dir = arglist.save_dir
        if arglist.display or arglist.restore or arglist.benchmark:
            print('Loading previous state...')
            U.load_state(arglist.load_dir)

        episode_rewards = [0.0]  # sum of rewards for all agents
        agent_rewards = [[0.0]
                         for _ in range(env.n)]  # individual agent reward
        final_ep_rewards = []  # sum of rewards for training curve
        final_ep_ag_rewards = []  # agent rewards for training curve
        agent_info = [[[]]]  # placeholder for benchmarking info
        saver = tf.train.Saver()
        obs_n = env.reset()
        episode_step = 0
        train_step = 0
        t_start = time.time()

        #if not (arglist.display or arglist.restore or arglist.benchmark):
        #    U.save_state(arglist.save_dir, saver=saver)
        #    print("Saved first checkpoint")

        current_game_experiences = []
        t0 = time.time()

        print('Starting iterations...')
        while True:

            new_experiences = load_new_experiences()
            for exp in new_experiences:
                obs_n, action_n, rew_n, new_obs_n, done_n, terminal = exp
                for i, agent in enumerate(trainers):
                    agent.experience(obs_n[i], action_n[i], rew_n[i],
                                     new_obs_n[i], done_n[i], terminal)

            # update all trainers, if not in display or benchmark mode
            loss = None
            for agent in trainers:
                agent.preupdate()
            for agent in trainers:
                loss = agent.update(trainers, train_step)

            U.save_state(arglist.save_dir, saver=saver)
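
Example #1 feeds the trainers with transitions collected elsewhere through a load_new_experiences() helper that is not shown. A minimal, hypothetical sketch of such a loader, assuming worker processes pickle lists of (obs_n, action_n, rew_n, new_obs_n, done_n, terminal) tuples into a shared directory, could look like this:

import os
import pickle

def load_new_experiences(exp_dir="./experiences"):
    """Hypothetical loader: read and delete any pickled transition files
    dropped into exp_dir. Each file is assumed to hold a list of
    (obs_n, action_n, rew_n, new_obs_n, done_n, terminal) tuples,
    matching the unpacking in Example #1."""
    experiences = []
    if not os.path.isdir(exp_dir):
        return experiences
    for name in sorted(os.listdir(exp_dir)):
        path = os.path.join(exp_dir, name)
        with open(path, 'rb') as fp:
            experiences.extend(pickle.load(fp))
        os.remove(path)
    return experiences
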
Example #2
def train(arglist):
    with U.single_threaded_session():
        # Create environment
        env = make_env(arglist.scenario, arglist, arglist.benchmark)

        # Create experience buffer
        replay_buffer = ReplayBuffer(
            arglist.num_episodes * arglist.max_episode_len
            if arglist.benchmark and arglist.save_replay else 1e6)
        min_replay_buffer_len = arglist.batch_size * arglist.max_episode_len

        # Create agent trainers
        obs_shape_n = [env.observation_space[i].shape for i in range(env.n)]
        num_adversaries = min(env.n, arglist.num_adversaries)
        trainers = get_trainers(env, num_adversaries, obs_shape_n, arglist)
        print('Using good policy {} and adv policy {}'.format(arglist.good_policy, arglist.adv_policy))

        # Initialize
        U.initialize()

        # Load previous results, if necessary
        if arglist.load_dir == "":
            arglist.load_dir = arglist.save_dir
        if arglist.display or arglist.restore or arglist.benchmark:
            print('Loading previous state...')
            U.load_state(arglist.load_dir)

        episode_rewards = [0.0]  # sum of rewards for all agents
        agent_rewards = [[0.0] for _ in range(env.n)]  # individual agent reward
        final_ep_rewards = []  # sum of rewards for training curve
        final_ep_ag_rewards = []  # agent rewards for training curve
        agent_info = [[[]]]  # placeholder for benchmarking info
        saver = tf.train.Saver()
        obs_n = env.reset()
        episode_step = 0
        train_step = 0
        t_start = time.time()
        # pick random agent from ensemble for each episode
        if arglist.ensemble_choice == 'episode':
            agent_ids = np.random.randint(arglist.ensemble_size, size=len(trainers))
            agents = [trainers[i][agent_id] for i, agent_id in enumerate(agent_ids)]

        print('Starting iterations...')
        while True:
            # pick random agent from ensemble for each timestep
            if arglist.ensemble_choice == 'timestep':
                agent_ids = np.random.randint(arglist.ensemble_size, size=len(trainers))
                agents = [trainers[i][agent_id] for i, agent_id in enumerate(agent_ids)]
            # get action
            action_n = [agent.action(obs) for agent, obs in zip(agents,obs_n)]
            # environment step
            new_obs_n, rew_n, done_n, info_n = env.step(action_n)
            episode_step += 1
            done = all(done_n)
            terminal = (episode_step >= arglist.max_episode_len)
            # collect experience
            replay_buffer.add(obs_n, action_n, rew_n, new_obs_n, done_n, agent_ids)
            obs_n = new_obs_n

            for i, rew in enumerate(rew_n):
                episode_rewards[-1] += rew
                agent_rewards[i][-1] += rew

            for i, info in enumerate(info_n):
                agent_info[-1][i].append(info_n['n'])

            # for displaying learned policies
            if arglist.display:
                time.sleep(0.1)
                env.render()

            if done or terminal:
                obs_n = env.reset()
                episode_step = 0
                episode_rewards.append(0)
                for a in agent_rewards:
                    a.append(0)
                agent_info.append([[]])
                # pick random agent from ensemble for each episode
                if arglist.ensemble_choice == 'episode':
                    agent_ids = np.random.randint(arglist.ensemble_size, size=len(trainers))
                    agents = [trainers[i][agent_id] for i, agent_id in enumerate(agent_ids)]

            # increment global step counter
            train_step += 1

            # for benchmarking learned policies
            if arglist.benchmark:
                if train_step >= arglist.benchmark_iters and (done or terminal):
                    file_name = arglist.benchmark_dir + arglist.exp_name + '.pkl'
                    print('Finished benchmarking, now saving...')
                    with open(file_name, 'wb') as fp:
                        pickle.dump(agent_info[:-1], fp)
                        if arglist.save_replay:
                            pickle.dump(replay_buffer._storage, fp)
                    break
                continue

            # update all trainers, if not in display or benchmark mode
            # only update every 100 steps and if replay buffer is large enough
            if train_step % 100 == 0 and len(replay_buffer) >= min_replay_buffer_len:
                for i, ensemble in enumerate(trainers):
                    for agent in ensemble:
                        # sample different batch for each agent in ensemble
                        batch_obs_n, batch_act_n, batch_rew_n, batch_obs_next_n, batch_done_n, batch_agent_ids = replay_buffer.sample(arglist.batch_size)
                        batch_obs_n = [batch_obs_n[:, j] for j in range(batch_obs_n.shape[1])]
                        batch_act_n = [batch_act_n[:, j] for j in range(batch_act_n.shape[1])]
                        batch_obs_next_n = [batch_obs_next_n[:, j] for j in range(batch_obs_next_n.shape[1])]
                        # choose random agent from ensemble for target action
                        batch_agents = [random.choice(ensemble) for ensemble in trainers]
                        loss = agent.update(batch_agents, batch_obs_n, batch_act_n, batch_rew_n[:, i], batch_obs_next_n, batch_done_n[:, i])

            # save model, display training output
            if terminal and (len(episode_rewards) % arglist.save_rate == 0):
                U.save_state(arglist.save_dir, saver=saver)
                # print statement depends on whether or not there are adversaries
                if num_adversaries == 0:
                    print("steps: {}, episodes: {}, mean episode reward: {}, time: {}".format(
                        train_step, len(episode_rewards), np.mean(episode_rewards[-arglist.save_rate:]), round(time.time()-t_start, 3)))
                else:
                    print("steps: {}, episodes: {}, mean episode reward: {}, agent episode reward: {}, time: {}".format(
                        train_step, len(episode_rewards), np.mean(episode_rewards[-arglist.save_rate:]),
                        [np.mean(rew[-arglist.save_rate:]) for rew in agent_rewards], round(time.time()-t_start, 3)))
                t_start = time.time()
                # Keep track of final episode reward
                final_ep_rewards.append(np.mean(episode_rewards[-arglist.save_rate:]))
                for rew in agent_rewards:
                    final_ep_ag_rewards.append(np.mean(rew[-arglist.save_rate:]))

            # saves final episode reward for plotting training curve later
            if len(episode_rewards) > arglist.num_episodes:
                rew_file_name = arglist.plots_dir + arglist.exp_name + '_rewards.pkl'
                with open(rew_file_name, 'wb') as fp:
                    pickle.dump(final_ep_rewards, fp)
                agrew_file_name = arglist.plots_dir + arglist.exp_name + '_agrewards.pkl'
                with open(agrew_file_name, 'wb') as fp:
                    pickle.dump(final_ep_ag_rewards, fp)
                print('...Finished total of {} episodes.'.format(len(episode_rewards)))
                break
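
Example #2 stores the whole joint transition (plus the sampled ensemble ids) in one shared replay buffer and splits the sampled arrays per agent with [:, j] indexing. The buffer class itself is not shown; a rough sketch compatible with those add()/sample() calls, assuming homogeneous shapes per field, might be:

import random
import numpy as np

class JointReplayBuffer(object):
    """Hypothetical buffer matching the add()/sample() usage in Example #2."""

    def __init__(self, size):
        self._storage = []
        self._maxsize = int(size)

    def __len__(self):
        return len(self._storage)

    def add(self, obs_n, action_n, rew_n, new_obs_n, done_n, agent_ids):
        if len(self._storage) >= self._maxsize:
            self._storage.pop(0)
        self._storage.append((obs_n, action_n, rew_n, new_obs_n, done_n, agent_ids))

    def sample(self, batch_size):
        batch = [random.choice(self._storage) for _ in range(batch_size)]
        # each returned array has shape (batch, n_agents, ...), so callers can
        # slice per agent with arr[:, j] as Example #2 does
        return [np.array(field) for field in zip(*batch)]
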
Example #3
            # pdb.set_trace()

            if arglist.display: continue

            # update all trainers, if not in display or benchmark mode
            loss = None
            for agent in trainers:
                agent.preupdate()
            for agent in trainers:
                loss = agent.update(trainers, train_step)

            # save model, display training output
            if terminal and (len(episode_rewards) % arglist.save_rate == 0):
                U.save_state(arglist.save_dir, saver=saver)
                # print statement depends on whether or not there are adversaries
                if num_adversaries == 0:
                    print("steps: {}, episodes: {}, mean episode reward: {}, time: {}".format(
                        train_step, len(episode_rewards), np.mean(episode_rewards[-arglist.save_rate:]), round(time.time()-t_start, 3)))
                else:
                    print("steps: {}, episodes: {}, mean episode reward: {}, agent episode reward: {}, time: {}".format(
                        train_step, len(episode_rewards), np.mean(episode_rewards[-arglist.save_rate:]),
                        [np.mean(rew[-arglist.save_rate:]) for rew in agent_rewards], round(time.time()-t_start, 3)))
                t_start = time.time()

                # Keep track of final episode reward
                final_ep_rewards.append(np.mean(episode_rewards[-arglist.save_rate:]))
                for rew in agent_rewards:
                    final_ep_ag_rewards.append(np.mean(rew[-arglist.save_rate:]))
Example #4
def train(arglist, extra_args=None):
    tf_graph = tf.Graph()
    tf_config = tf.ConfigProto(inter_op_parallelism_threads=1,
                               intra_op_parallelism_threads=1)
    tf_config.gpu_options.allow_growth = True
    with tf.Session(graph=tf_graph, config=tf_config):
        # Create environment
        env = make_env(arglist.scenario, arglist)
        # Create agent trainers
        obs_shape_n = [env.observation_space[i].shape for i in range(env.n)]
        if arglist.num_adversaries is None:
            arglist.num_adversaries = len([
                agent for agent in env.agents
                if (hasattr(agent, "adversary") and agent.adversary)
            ])
        arglist.num_adversaries = min(env.n, arglist.num_adversaries)
        num_adversaries = arglist.num_adversaries
        trainers = get_trainers(env, num_adversaries, obs_shape_n, arglist)
        print('Using good policy {} and adv policy {}'.format(
            arglist.good_policy, arglist.adv_policy))

        # Initialize
        U.initialize()

        if os.environ.get("OUTPUT_GRAPH"):
            tf.summary.FileWriter(os.path.join(logger.get_dir(), "tb"),
                                  U.get_session().graph)

        # Load previous results, if necessary
        if arglist.display or arglist.restore or arglist.benchmark:
            print('Loading previous state...')
            U.load_state(arglist.load_dir)

        episode_rewards = [0.0]  # sum of rewards for all agents
        agent_rewards = [[0.0]
                         for _ in range(env.n)]  # individual agent reward
        final_ep_rewards = []  # sum of rewards for training curve
        final_ep_ag_rewards = []  # agent rewards for training curve
        agent_info = [[[]]]  # placeholder for benchmarking info
        saver = tf.train.Saver(max_to_keep=None)
        obs_n = env.reset()
        episode_step = 0
        train_step = 0
        t_start = time.time()

        print('Starting iterations...')
        while True:
            # get action
            action_n = [
                agent.action(obs) for agent, obs in zip(trainers, obs_n)
            ]
            # print("[action] " + ", ".join(["agent {i}: {action}".format(i=i, action=list(action_n[i])) for i in range(len(action_n))]))
            # environment step
            new_obs_n, rew_n, done_n, info_n = env.step(action_n)
            episode_step += 1
            done = all(done_n)
            terminal = (episode_step >= arglist.max_episode_len)
            # collect experience
            for i, agent in enumerate(trainers):
                agent.experience(obs_n[i], action_n[i], rew_n[i], new_obs_n[i],
                                 done_n[i], terminal)
            obs_n = new_obs_n

            for i, rew in enumerate(rew_n):
                episode_rewards[-1] += rew
                agent_rewards[i][-1] += rew

            if done or terminal:
                if arglist.save_render_images:
                    input_file_name = os.path.join(
                        arglist.render_dir,
                        "image-episode_{}-step_%d.png".format(
                            len(episode_rewards)))
                    output_file_name = os.path.join(
                        arglist.render_dir,
                        "video-episode_{}.mp4".format(len(episode_rewards)))
                    command = "ffmpeg -y -r 10 -i {} {}".format(
                        input_file_name, output_file_name)
                    os.system(command)
                    print("Saved render video at {}".format(output_file_name))

                    for episode_step_ in range(episode_step):
                        file_name = os.path.join(
                            arglist.render_dir,
                            "image-episode_{}-step_{}.png".format(
                                len(episode_rewards), episode_step_))
                        if os.path.exists(file_name):
                            os.remove(file_name)

                obs_n = env.reset()
                episode_step = 0
                episode_rewards.append(0)
                for a in agent_rewards:
                    a.append(0)
                agent_info.append([[]])

            # increment global step counter
            train_step += 1

            # for benchmarking learned policies
            if arglist.benchmark:
                for i, info in enumerate(info_n):
                    agent_info[-1][i].append(info_n['n'])
                if train_step > arglist.benchmark_iters and (done or terminal):
                    file_name = os.path.join(arglist.benchmark_dir,
                                             'benchmark.pkl')
                    print('Finished benchmarking, now saving...')
                    with open(file_name, 'wb') as fp:
                        pickle.dump(agent_info[:-1], fp)
                    break
                continue

            # for displaying learned policies
            if arglist.display:
                time.sleep(0.1)
                if arglist.save_render_images:
                    images = env.render(mode="rgb_array")
                    image = images[0]
                    file_name = os.path.join(
                        arglist.render_dir,
                        "image-episode_{}-step_{}.png".format(
                            len(episode_rewards), episode_step))
                    plt.imsave(file_name, image)
                    print("Saved render image at {}".format(file_name))
                else:
                    env.render(mode="human")
                continue

            # update all trainers, if not in display or benchmark mode
            loss = None
            for agent in trainers:
                agent.preupdate()
            for agent in trainers:
                loss = agent.update(trainers, train_step)

            # save model
            if terminal and (len(episode_rewards) % arglist.save_rate == 0):
                U.save_state(os.path.join(
                    arglist.save_dir,
                    "checkpoint-episode_{}".format(len(episode_rewards))),
                             saver=saver)

            # print training scalars
            if terminal and ((len(episode_rewards) % arglist.print_rate == 0)
                             or
                             (len(episode_rewards) % arglist.save_rate == 0)):
                # print statement depends on whether or not there are adversaries
                logger.log("Time: {}".format(
                    datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")))
                logger.logkv("steps", train_step)
                logger.logkv("episodes", len(episode_rewards))
                logger.logkv("mean_episode_reward",
                             np.mean(episode_rewards[-arglist.save_rate:]))
                if num_adversaries == 0:
                    # print("[{}] steps: {}, episodes: {}, mean episode reward: {}, time: {}".format(time.strftime("%Y-%m-%dT%H:%M:%S", time.localtime()),
                    #     train_step, len(episode_rewards), np.mean(episode_rewards[-arglist.save_rate:]), round(time.time()-t_start, 3)))
                    pass
                else:
                    for agent_index in range(len(agent_rewards)):
                        logger.logkv(
                            "agent_{}_episode_reward".format(agent_index),
                            np.mean(agent_rewards[agent_index]
                                    [-arglist.save_rate:]))
                    # print("[{}] steps: {}, episodes: {}, mean episode reward: {}, agent episode reward: {}, time: {}".format(time.strftime("%Y-%m-%dT%H:%M:%S", time.localtime()),
                    #     train_step, len(episode_rewards), np.mean(episode_rewards[-arglist.save_rate:]),
                    #     [np.mean(rew[-arglist.save_rate:]) for rew in agent_rewards], round(time.time()-t_start, 3)))
                logger.logkv("time", round(time.time() - t_start, 3))
                logger.dumpkvs()
                t_start = time.time()
                # Keep track of final episode reward
                final_ep_rewards.append(
                    np.mean(episode_rewards[-arglist.save_rate:]))
                for rew in agent_rewards:
                    final_ep_ag_rewards.append(
                        np.mean(rew[-arglist.save_rate:]))

            # saves final episode reward for plotting training curve later
            if len(episode_rewards) > arglist.num_episodes:
                rew_file_name = os.path.join(arglist.plots_dir, 'rewards.pkl')
                with open(rew_file_name, 'wb') as fp:
                    pickle.dump(final_ep_rewards, fp)
                agrew_file_name = os.path.join(arglist.plots_dir,
                                               'average_rewards.pkl')
                with open(agrew_file_name, 'wb') as fp:
                    pickle.dump(final_ep_ag_rewards, fp)
                print('...Finished total of {} episodes.'.format(
                    len(episode_rewards)))
                break
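
Example #4 relies on several command-line options that the stock MADDPG parser does not define (a num_adversaries that may be None, save_render_images, render_dir, print_rate). A hedged argparse sketch of how those extras might be declared (the names come from the example, the defaults are assumptions) is:

import argparse

def add_render_and_logging_args(parser=None):
    # Hypothetical declarations for the extra flags used in Example #4.
    parser = parser or argparse.ArgumentParser("MADDPG extras")
    parser.add_argument("--num-adversaries", type=int, default=None,
                        help="if None, inferred from each agent's .adversary flag")
    parser.add_argument("--save-render-images", action="store_true",
                        help="dump per-step PNG frames and encode an mp4 per episode")
    parser.add_argument("--render-dir", type=str, default="./render/")
    parser.add_argument("--print-rate", type=int, default=100,
                        help="episodes between logger.dumpkvs() calls")
    return parser
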
Example #5
def train(arglist):
    with U.single_threaded_session() as sess:
        # Create environment
        env = make_env(arglist.scenario, arglist, arglist.benchmark)
        # Create agent trainers
        obs_shape_n = [env.observation_space[i].shape for i in range(env.n)]
        num_adversaries = min(env.n, arglist.num_adversaries)
        trainers = get_trainers(env, num_adversaries, obs_shape_n, arglist)
        print('Using good policy {} and adv policy {}'.format(
            arglist.good_policy, arglist.adv_policy))

        saver = tf.train.Saver()
        # Initialize
        U.initialize()
        summary_writer = tf.summary.FileWriter(arglist.summary_dir, sess.graph)
        summary_placeholders, update_ops, summary_op = setup_summary()

        # Load previous results, if necessary
        if arglist.load_dir == "":
            arglist.load_dir = arglist.save_dir
        if arglist.display or arglist.restore or arglist.benchmark:
            print('Loading previous state...')
            U.load_state(arglist.load_dir)
            #saver.restore(sess, "/home/sugon/Peixian/maddpg_peixian/maddpg/experiments/tmp/policy/simple_comm_-4166440")
            #print ("successfully restored")

        episode_rewards = [0.0]  # sum of rewards for all agents
        agent_rewards = [[0.0]
                         for _ in range(env.n)]  # individual agent reward
        final_ep_rewards = []  # sum of rewards for training curve
        final_ep_ag_rewards = []  # agent rewards for training curve
        agent_info = [[[]]]  # placeholder for benchmarking info
        saver = tf.train.Saver(max_to_keep=3)
        obs_n = env.reset()
        episode_step = 0
        train_step = 0
        t_start = time.time()

        adversary_rewards = 0.0
        goodagent_rewards = 0.0

        print('Starting iterations...')
        while True:
            #input('...')
            # get action
            action_n = [
                agent.action(obs) for agent, obs in zip(trainers, obs_n)
            ]
            # environment step
            new_obs_n, rew_n, done_n, info_n = env.step(action_n)
            episode_step += 1
            done = all(done_n)
            terminal = (episode_step >= arglist.max_episode_len)
            # collect experience
            for i, agent in enumerate(trainers):
                agent.experience(obs_n[i], action_n[i], rew_n[i], new_obs_n[i],
                                 done_n[i], terminal)
            obs_n = new_obs_n

            for i, rew in enumerate(rew_n):
                #print (i,":",rew_n[i])
                episode_rewards[-1] += rew
                agent_rewards[i][-1] += rew
                if i < num_adversaries:
                    adversary_rewards += rew
                else:
                    goodagent_rewards += rew

            if done or terminal:
                if done:
                    print("*" * 20)
                    print("done:", episode_step)

                stats = [adversary_rewards, episode_step, goodagent_rewards]
                for i in range(len(stats)):
                    sess.run(
                        update_ops[i],
                        feed_dict={summary_placeholders[i]: float(stats[i])})
                summary_str = sess.run(summary_op)
                summary_writer.add_summary(summary_str,
                                           len(episode_rewards) + 1)

                obs_n = env.reset()
                episode_step = 0
                adversary_rewards = 0.0
                goodagent_rewards = 0.0
                episode_rewards.append(0)
                for a in agent_rewards:
                    a.append(0)
                agent_info.append([[]])

            # increment global step counter
            train_step += 1

            # for benchmarking learned policies
            if arglist.benchmark:
                for i, info in enumerate(info_n):
                    agent_info[-1][i].append(info_n['n'])
                if train_step > arglist.benchmark_iters and (done or terminal):
                    file_name = arglist.benchmark_dir + arglist.exp_name + '.pkl'
                    print('Finished benchmarking, now saving...')
                    with open(file_name, 'wb') as fp:
                        pickle.dump(agent_info[:-1], fp)
                    break
                continue

            # for displaying learned policies
            if arglist.display:
                time.sleep(0.1)
                env.render()
                continue

            # update all trainers, if not in display or benchmark mode
            loss = None
            for agent in trainers:
                agent.preupdate()
            for agent in trainers:
                loss = agent.update(trainers, train_step)

            # save model, display training output
            if (done or terminal) and (len(episode_rewards) % arglist.save_rate
                                       == 0):
                U.save_state(arglist.save_dir, train_step, saver=saver)
                # print statement depends on whether or not there are adversaries
                if num_adversaries == 0:
                    print(
                        "steps: {}, episodes: {}, mean episode reward: {}, time: {}"
                        .format(train_step, len(episode_rewards),
                                np.mean(episode_rewards[-arglist.save_rate:]),
                                round(time.time() - t_start, 3)))
                else:
                    print(
                        "steps: {}, episodes: {}, mean episode reward: {}, agent episode reward: {}, time: {}"
                        .format(train_step, len(episode_rewards),
                                np.mean(episode_rewards[-arglist.save_rate:]),
                                [
                                    np.mean(rew[-arglist.save_rate:])
                                    for rew in agent_rewards
                                ], round(time.time() - t_start, 3)))
                t_start = time.time()
                # Keep track of final episode reward
                final_ep_rewards.append(
                    np.mean(episode_rewards[-arglist.save_rate:]))
                for rew in agent_rewards:
                    final_ep_ag_rewards.append(
                        np.mean(rew[-arglist.save_rate:]))

            # saves final episode reward for plotting training curve later
            if len(episode_rewards) > arglist.num_episodes:
                rew_file_name = arglist.plots_dir + arglist.exp_name + '_rewards.pkl'
                with open(rew_file_name, 'wb') as fp:
                    pickle.dump(final_ep_rewards, fp)
                agrew_file_name = arglist.plots_dir + arglist.exp_name + '_agrewards.pkl'
                with open(agrew_file_name, 'wb') as fp:
                    pickle.dump(final_ep_ag_rewards, fp)
                print('...Finished total of {} episodes.'.format(
                    len(episode_rewards)))
                break
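
Example #5 calls a setup_summary() helper that is not included in the snippet. Under the usual TF1 pattern of feeding Python scalars through placeholders into summary variables, a hedged reconstruction consistent with the stats = [adversary_rewards, episode_step, goodagent_rewards] ordering could be:

import tensorflow as tf

def setup_summary():
    # Hypothetical TF1-style helper: one non-trainable variable per tracked
    # scalar, an assign op fed from a placeholder, and a merged summary op
    # for the FileWriter. The order matches the stats list in Example #5.
    names = ["adversary_episode_reward", "episode_length", "good_agent_episode_reward"]
    variables = [tf.Variable(0.0, trainable=False) for _ in names]
    placeholders = [tf.placeholder(tf.float32) for _ in names]
    update_ops = [v.assign(p) for v, p in zip(variables, placeholders)]
    for name, v in zip(names, variables):
        tf.summary.scalar(name, v)
    summary_op = tf.summary.merge_all()
    return placeholders, update_ops, summary_op
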
Example #6
def train(arglist):
    os.environ["CUDA_VISIBLE_DEVICES"] = "0,1"
    with U.single_threaded_session():
        # Create environment
        env = make_env(arglist.scenario, arglist, arglist.benchmark)
        # reset first so that env.observation_space is initialized before the trainers are created
        obs_n = env.reset()
        # Create agent trainers
        num_adversaries = arglist.num_adversaries
        obs_shape_n = [env.observation_space[i].shape for i in range(env.n)]
        print("env.observation_space:", env.observation_space)
        print("num adversaries: ", num_adversaries, ", env.n (num agents): ",
              env.n)

        #need to ensure that the trainer is in correct order. pacman in front
        trainers = get_trainers(env, num_adversaries, obs_shape_n, arglist)
        print('Using good policy {} and adv policy {}'.format(
            arglist.good_policy, arglist.adv_policy))

        # Initialize
        U.initialize()

        # Load previous results, if necessary
        if arglist.load_dir == "":
            arglist.load_dir = arglist.save_dir + ("{}".format(
                arglist.load_episode))
        if arglist.restore or arglist.benchmark:
            print('Loading previous state...')
            U.load_state(arglist.load_dir)
        if arglist.display and arglist.load:
            print('Loading previous state...')
            U.load_state(arglist.load_dir)

        episode_rewards = [0.0]  # sum of rewards for all agents
        agent_rewards = [[0.0]
                         for _ in range(env.n)]  # individual agent reward
        final_ep_rewards = []  # sum of rewards for training curve
        final_ep_ag_rewards = [[] for i in range(env.n)
                               ]  # agent rewards for training curve
        agent_info = [[[]]]  # placeholder for benchmarking info
        saver = tf.train.Saver(max_to_keep=None)
        episode_step = 0
        train_step = 0
        total_win = [0]
        final_win = []
        total_lose = [0]
        final_lose = []
        t_start = time.time()
        loss_list = {}
        for i in range(env.n):
            loss_list[i] = [[] for _ in range(6)]

        print('Starting iterations...')
        while True:
            # get action
            action_n = [
                agent.action(obs) for agent, obs in zip(trainers, obs_n)
            ]
            # environment step
            new_obs_n, rew_n, done, info_n, win, lose = env.step(action_n)
            episode_step += 1
            terminal = (episode_step >= arglist.max_episode_len)
            # print("obs_n", obs_n)
            # print("new_obs_n", new_obs_n)
            #print("action_n", action_n)
            # print("rew_n",episode_step, rew_n)
            # print("done", done)
            # print("terminal", terminal)
            # collect experience
            for i, agent in enumerate(trainers):
                agent.experience(obs_n[i], action_n[i], rew_n[i], new_obs_n[i],
                                 done, terminal)
            obs_n = new_obs_n
            for i, rew in enumerate(rew_n):
                episode_rewards[-1] += rew
                agent_rewards[i][-1] += rew
            if done or terminal:
                if arglist.display:
                    env.render()
                obs_n = env.reset()
                episode_step = 0
                if win:
                    total_win[-1] += 1
                if lose:
                    total_lose[-1] += 1
                total_win.append(0)
                total_lose.append(0)
                episode_rewards.append(0)
                for a in agent_rewards:
                    a.append(0)
                agent_info.append([[]])

            # increment global step counter
            train_step += 1
            # if train_step % 1000 == 0:
            #     print(train_step)
            # for benchmarking learned policies
            if arglist.benchmark:
                for i, info in enumerate(info_n):
                    agent_info[-1][i].append(info_n['n'])
                if train_step > arglist.benchmark_iters and (done or terminal):
                    file_name = arglist.benchmark_dir + arglist.exp_name + '.pkl'
                    print('Finished benchmarking, now saving...')
                    with open(file_name, 'wb') as fp:
                        pickle.dump(agent_info[:-1], fp)
                    break
                continue

            # for displaying learned policies
            if arglist.display:
                time.sleep(0.1)
                env.render()
                continue

            # update all trainers, if not in display or benchmark mode
            loss = None

            for agent in trainers:
                agent.preupdate()
            for ind, agent in enumerate(trainers):
                loss = agent.update(trainers, train_step)
                if train_step % 10000 == 0 and loss is not None:
                    for i in range(len(loss)):
                        loss_list[ind][i].append(loss[i])

            # save model, display training output
            if (terminal or done) and (len(episode_rewards) % arglist.save_rate
                                       == 0):
                saving = arglist.save_dir + (
                    "{}".format(0 + len(episode_rewards))
                )  #TODO why append this
                U.save_state(saving, saver=saver)
                # print statement depends on whether or not there are adversaries
                if num_adversaries == 0:
                    print(
                        "steps: {}, episodes: {}, mean episode reward: {}, time: {}"
                        .format(train_step, len(episode_rewards),
                                np.mean(episode_rewards[-arglist.save_rate:]),
                                round(time.time() - t_start, 3)))
                else:
                    print(
                        "steps: {}, episodes: {}, mean episode reward: {}, agent episode reward: {}, number of wins {}, number of lose {}, "
                        "time: {}".format(
                            train_step, len(episode_rewards),
                            np.mean(episode_rewards[-arglist.save_rate:]), [
                                np.mean(rew[-arglist.save_rate:])
                                for rew in agent_rewards
                            ], np.sum(total_win[-arglist.save_rate:]),
                            np.sum(total_lose[-arglist.save_rate:]),
                            round(time.time() - t_start, 3)))
                t_start = time.time()
                # Keep track of final episode reward
                final_ep_rewards.append(
                    np.mean(episode_rewards[-arglist.save_rate:]))
                final_win.append(np.sum(total_win[-arglist.save_rate:]))
                final_lose.append(np.sum(total_lose[-arglist.save_rate:]))

                ep_reward_df = pd.DataFrame(final_ep_rewards)
                ep_ag_reward_df = pd.DataFrame(final_ep_ag_rewards)
                win_df = pd.DataFrame(final_win)
                lose_df = pd.DataFrame(final_lose)
                for i in range(env.n):
                    trainer_loss_df = pd.DataFrame(loss_list[i]).transpose()
                    trainer_loss_df.to_csv(arglist.plots_dir +
                                           arglist.exp_name +
                                           '_trainer_loss_df_{}.csv'.format(i))

                ep_reward_df.to_csv(arglist.plots_dir + arglist.exp_name +
                                    '_rewards.csv')
                ep_ag_reward_df.to_csv(arglist.plots_dir + arglist.exp_name +
                                       '_agrewards.csv')
                win_df.to_csv(arglist.plots_dir + arglist.exp_name +
                              '_win_df.csv')
                lose_df.to_csv(arglist.plots_dir + arglist.exp_name +
                               '_lose_df.csv')

                for i, rew in enumerate(agent_rewards):
                    final_ep_ag_rewards[i].append(
                        np.mean(rew[-arglist.save_rate:]))
            # saves final episode reward for plotting training curve later

            if len(episode_rewards) > arglist.num_episodes:
                # rew_file_name = arglist.plots_dir + arglist.exp_name + '_rewards.pkl'
                # with open(rew_file_name, 'wb') as fp:
                #     pickle.dump(final_ep_rewards, fp)
                # agrew_file_name = arglist.plots_dir + arglist.exp_name + '_agrewards.pkl'
                # with open(agrew_file_name, 'wb') as fp:
                #     pickle.dump(final_ep_ag_rewards, fp)
                print('...Finished total of {} episodes.'.format(
                    len(episode_rewards)))
                break
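
Example #6 writes its training curves to CSV via pandas instead of pickling them. A short, assumed post-processing snippet that reads those files back for plotting (plots_dir and exp_name are placeholders matching the to_csv calls above) might be:

import pandas as pd
import matplotlib.pyplot as plt

# Placeholder paths; substitute the actual plots_dir and exp_name values.
plots_dir, exp_name = "./plots/", "exp"
rewards = pd.read_csv(plots_dir + exp_name + "_rewards.csv", index_col=0)
wins = pd.read_csv(plots_dir + exp_name + "_win_df.csv", index_col=0)
loses = pd.read_csv(plots_dir + exp_name + "_lose_df.csv", index_col=0)

rewards.plot(title="mean episode reward per save interval", legend=False)
wins.plot(title="wins per save interval", legend=False)
loses.plot(title="losses per save interval", legend=False)
plt.show()
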
Example #7
def train(arglist):
    with U.single_threaded_session():
        # Create environment
        env = make_env(arglist)
        # Create agent trainers
        obs_shape_n = [env.observation_space[i].shape for i in range(env.n)]

        board_write_path = './board/' + datetime.now().strftime("%Y%m%d_%H%M%S")
        os.makedirs(board_write_path)
        board_writer = tf.summary.FileWriter(board_write_path)

        trainers = get_trainers(env, obs_shape_n, arglist, board_writer)
        print('Using good policy {} and adv policy {}'.format(arglist.good_policy, arglist.adv_policy))

        # Initialize
        U.initialize()

        # Load previous results, if necessary
        if arglist.load_dir == "":
            arglist.load_dir = arglist.save_dir
        if arglist.display or arglist.restore or arglist.benchmark:
            print('Loading previous state...')
            U.load_state(arglist.load_dir)

        episode_rewards = [0.0]  # sum of rewards for all agents
        agent_rewards = [[0.0] for _ in range(env.n)]  # individual agent reward
        final_ep_rewards = []  # sum of rewards for training curve
        final_ep_ag_rewards = []  # agent rewards for training curve
        agent_info = [[[]]]  # placeholder for benchmarking info
        evaluate_rewards = []
        saver = tf.train.Saver()
        obs_n = env.reset()
        episode_step = 0
        train_step = 0
        t_start = time.time()

        print('Starting iterations...')
        while True:
            # get action
            action_n = [agent.action(obs) for agent, obs in zip(trainers, obs_n)]
            # environment step
            action_n_saved = deepcopy(action_n)

            if arglist.display:
                for idx, (agent, obs) in enumerate(zip(trainers, obs_n)):
                    action_result = agent.p_debug['p_values'](obs[None])[0]
                    print("agent_%d" % idx, action_result)

            new_obs_n, rew_n, done_n, info_n = env.step(action_n)
            action_n = action_n_saved
            episode_step += 1
            done = all(done_n)
            terminal = (episode_step >= arglist.max_episode_len)
            # collect experience
            for i, agent in enumerate(trainers):
                agent.experience(obs_n[i], action_n[i], rew_n[i], new_obs_n[i], done_n[i], terminal)
            obs_n = new_obs_n

            for i, rew in enumerate(rew_n):
                episode_rewards[-1] += rew
                agent_rewards[i][-1] += rew

            if done or terminal:
                obs_n = env.reset()
                episode_step = 0
                episode_rewards.append(0)
                for a in agent_rewards:
                    a.append(0)
                agent_info.append([[]])

            # increment global step counter
            train_step += 1

            # for benchmarking learned policies
            if arglist.benchmark:
                for i, info in enumerate(info_n):
                    agent_info[-1][i].append(info_n['n'])
                if train_step > arglist.benchmark_iters and (done or terminal):
                    file_name = arglist.benchmark_dir + arglist.exp_name + '.pkl'
                    print('Finished benchmarking, now saving...')
                    with open(file_name, 'wb') as fp:
                        pickle.dump(agent_info[:-1], fp)
                    break
                continue

            if arglist.display:
                continue

            # update all trainers, if not in display or benchmark mode
            if train_step % 100 == 0 and len(trainers[0].replay_buffer) >= trainers[0].max_replay_buffer_len:
                loss = None
                replay_sample_index = trainers[0].get_memory_index()

                obs_n_sampled = []
                obs_next_n_sampled = []
                act_n_sampled = []
                for agent in trainers:
                    agent.set_memory_index(replay_sample_index)
                    obs_sampled, act_sampled, _, obs_next_sampled, _ = agent.get_replay_data()
                    obs_n_sampled.append(obs_sampled)
                    obs_next_n_sampled.append(obs_next_sampled)
                    act_n_sampled.append(act_sampled)
                target_act_next_n = []
                for agent in trainers:
                    target_act_next_n.append(agent.get_target_act(obs_next_n_sampled))

                for agent in trainers:
                    loss = agent.update(train_step, obs_n_sampled, act_n_sampled, obs_next_n_sampled,
                                        target_act_next_n)

            if np.isnan(episode_rewards[-1]):
                print("NaN occurred!")
                break

            # save model, display training output
            if terminal and (len(episode_rewards) % arglist.save_rate == 0):
                U.save_state(arglist.save_dir, saver=saver)
                # print statement depends on whether or not there are adversaries
                print("steps: {}, episodes: {}, mean episode reward: {}, agent episode reward: {}, time: {}".format(
                    train_step, len(episode_rewards), np.mean(episode_rewards[-arglist.save_rate:]),
                    [np.mean(rew[-arglist.save_rate:]) for rew in agent_rewards], round(time.time() - t_start, 3)))
                t_start = time.time()
                # Keep track of final episode reward
                final_ep_rewards.append(np.mean(episode_rewards[-arglist.save_rate:]))
                for rew in agent_rewards:
                    final_ep_ag_rewards.append(np.mean(rew[-arglist.save_rate:]))

                evaluate_rewards.append(evaluate(arglist, trainers, is_toy=True))

            # saves final episode reward for plotting training curve later
            if len(episode_rewards) > arglist.num_episodes:
                rew_file_name = arglist.plots_dir + arglist.exp_name + '_rewards.pkl'
                with open(rew_file_name, 'wb') as fp:
                    pickle.dump(final_ep_rewards, fp)
                agrew_file_name = arglist.plots_dir + arglist.exp_name + '_agrewards.pkl'
                with open(agrew_file_name, 'wb') as fp:
                    pickle.dump(final_ep_ag_rewards, fp)
                print('...Finished total of {} episodes.'.format(len(episode_rewards)))
                with open(arglist.plots_dir + arglist.exp_name + "_evaluate_rewards.pkl", 'wb') as fp:
                    pickle.dump(evaluate_rewards, fp)
                break
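
Example #7 periodically calls an evaluate(arglist, trainers, is_toy=True) helper that is not shown. A hedged sketch of what such an evaluation pass might do (greedy rollouts in a fresh environment built with the example's own make_env, no training, return the mean total reward; the real helper may differ) is:

import numpy as np

def evaluate(arglist, trainers, is_toy=False, episodes=10):
    # Hypothetical evaluation routine; is_toy is accepted only to match the
    # call signature used in Example #7.
    eval_env = make_env(arglist)
    totals = []
    for _ in range(episodes):
        obs_n = eval_env.reset()
        total, step = 0.0, 0
        while True:
            action_n = [agent.action(obs) for agent, obs in zip(trainers, obs_n)]
            obs_n, rew_n, done_n, _ = eval_env.step(action_n)
            total += float(np.sum(rew_n))
            step += 1
            if all(done_n) or step >= arglist.max_episode_len:
                break
        totals.append(total)
    return float(np.mean(totals))
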
Example #8
    def update(self, arglist, obs_n, rew_n, done_n, info_n, terminal):
        # info_n is False only on the very first call, before any transition has been generated
        if info_n != False:
            done = all(done_n)

            # collect experience
            for i, agent in enumerate(self.trainers):
                # do this every iteration
                if arglist.critic_lstm and arglist.actor_lstm:
                    agent.experience(
                        self.prev_obs_n[i],
                        self.action_n[i],
                        rew_n[i],
                        obs_n[i],
                        done_n[i],  # terminal,
                        self.p_in_c_n[i][0],
                        self.p_in_h_n[i][0],
                        self.p_out_c_n[i][0],
                        self.p_out_h_n[i][0],
                        self.q_in_c_n[i][0],
                        self.q_in_h_n[i][0],
                        self.q_out_c_n[i][0],
                        self.q_out_h_n[i][0],
                        self.new_episode)
                elif arglist.critic_lstm:
                    agent.experience(
                        self.prev_obs_n[i],
                        self.action_n[i],
                        rew_n[i],
                        obs_n[i],
                        done_n[i],  # terminal,
                        self.q_in_c_n[i][0],
                        self.q_in_h_n[i][0],
                        self.q_out_c_n[i][0],
                        self.q_out_h_n[i][0],
                        self.new_episode)
                elif arglist.actor_lstm:
                    agent.experience(
                        self.prev_obs_n[i],
                        self.action_n[i],
                        rew_n[i],
                        obs_n[i],
                        done_n[i],  # terminal,
                        self.p_in_c_n[i][0],
                        self.p_in_h_n[i][0],
                        self.p_out_c_n[i][0],
                        self.p_out_h_n[i][0],
                        self.new_episode)
                else:
                    agent.experience(
                        self.prev_obs_n[i],
                        self.action_n[i],
                        rew_n[i],
                        obs_n[i],
                        done_n[i],  # terminal,
                        self.new_episode)

            # Adding rewards
            if arglist.tracking:
                for i, a in enumerate(self.trainers):
                    a.tracker.record_information("ag_reward", rew_n[i])
                    a.tracker.record_information("team_dist_reward",
                                                 info_n["team_dist"][i])
                    a.tracker.record_information("team_diff_reward",
                                                 info_n["team_diff"][i])

            # Closing graph writer
            if arglist.graph:
                self.writer.close()
            for i, rew in enumerate(rew_n):
                self.episode_rewards[-1] += rew
                self.agent_rewards[i][-1] += rew

            # If an episode was finished, reset internal values
            if done or terminal:
                self.new_episode = True
                # reset trainers
                if arglist.actor_lstm or arglist.critic_lstm:
                    for agent in self.trainers:
                        agent.reset_lstm()
                if arglist.tracking:
                    for agent in self.trainers:
                        agent.tracker.reset()
                self.episode_rewards.append(0)
                for a in self.agent_rewards:
                    a.append(0)
                self.agent_info.append([[]])
            else:
                self.new_episode = False

            # increment global step counter
            self.train_step += 1

            # for benchmarking learned policies
            if arglist.benchmark:
                for i, info in enumerate(info_n):
                    self.agent_info[-1][i].append(info_n['n'])
                if self.train_step > arglist.benchmark_iters and (done
                                                                  or terminal):
                    file_name = arglist.benchmark_dir + arglist.exp_name + '.pkl'
                    print('Finished benchmarking, now saving...')
                    with open(file_name, 'wb') as fp:
                        pickle.dump(self.agent_info[:-1], fp)
                    return
            # otherwise training
            else:
                # update all trainers, if not in display or benchmark mode
                loss = None

                # get same episode sampling
                if arglist.sync_sampling:
                    inds = [
                        random.randint(
                            0,
                            len(self.trainers[0].replay_buffer._storage) - 1)
                        for i in range(arglist.batch_size)
                    ]
                else:
                    inds = None

                for agent in self.trainers:
                    # if arglist.lstm:
                    #     agent.preupdate(inds=inds)
                    # else:
                    agent.preupdate(inds)
                for agent in self.trainers:
                    loss = agent.update(self.trainers, self.train_step)
                    if loss is None: continue

                # save model, display training output
                if terminal and (len(self.episode_rewards) % arglist.save_rate
                                 == 0):
                    U.save_state(arglist.save_dir, saver=self.saver)
                    # print statement depends on whether or not there are adversaries
                    if self.num_adversaries == 0:
                        print(
                            "steps: {}, episodes: {}, mean episode reward: {}, time: {}"
                            .format(
                                self.train_step, len(self.episode_rewards),
                                np.mean(
                                    self.episode_rewards[-arglist.save_rate:]),
                                round(time.time() - self.t_start, 3)))
                    else:
                        print(
                            "steps: {}, episodes: {}, mean episode reward: {}, agent episode reward: {}, time: {}"
                            .format(
                                self.train_step, len(self.episode_rewards),
                                np.mean(
                                    self.episode_rewards[-arglist.save_rate:]),
                                [
                                    np.mean(rew[-arglist.save_rate:])
                                    for rew in self.agent_rewards
                                ], round(time.time() - self.t_start, 3)))
                    self.t_start = time.time()
                    # Keep track of final episode reward
                    self.final_ep_rewards.append(
                        np.mean(self.episode_rewards[-arglist.save_rate:]))
                    for rew in self.agent_rewards:
                        self.final_ep_ag_rewards.append(
                            np.mean(rew[-arglist.save_rate:]))

        if arglist.actor_lstm:
            # get critic input states
            self.p_in_c_n, self.p_in_h_n = get_lstm_states(
                'p', self.trainers)  # num_trainers x 1 x 1 x 64
        if arglist.critic_lstm:
            self.q_in_c_n, self.q_in_h_n = get_lstm_states(
                'q', self.trainers)  # num_trainers x 1 x 1 x 64

        # get action
        self.action_n = [
            agent.action(obs) for agent, obs in zip(self.trainers, obs_n)
        ]
        if arglist.critic_lstm:
            # get critic output states
            p_states = [self.p_in_c_n, self.p_in_h_n
                        ] if arglist.actor_lstm else []
            update_critic_lstm(self.trainers, obs_n, self.action_n, p_states)
            self.q_out_c_n, self.q_out_h_n = get_lstm_states(
                'q', self.trainers)  # num_trainers x 1 x 1 x 64
        if arglist.actor_lstm:
            self.p_out_c_n, self.p_out_h_n = get_lstm_states(
                'p', self.trainers)  # num_trainers x 1 x 1 x 64

        self.prev_obs_n = obs_n

        return self.action_n
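
The four agent.experience(...) branches in Example #8 differ only in which LSTM states they pass along. A hedged refactor sketch that assembles those optional arguments once (same argument order as the original branches, with runner standing in for self) is:

def lstm_state_args(runner, i, arglist):
    # Collect the optional LSTM state arguments in the order the original
    # branches pass them: actor (p) states first, then critic (q) states.
    extra = []
    if arglist.actor_lstm:
        extra += [runner.p_in_c_n[i][0], runner.p_in_h_n[i][0],
                  runner.p_out_c_n[i][0], runner.p_out_h_n[i][0]]
    if arglist.critic_lstm:
        extra += [runner.q_in_c_n[i][0], runner.q_in_h_n[i][0],
                  runner.q_out_c_n[i][0], runner.q_out_h_n[i][0]]
    return extra

# usage inside the experience-collection loop (sketch):
#   agent.experience(self.prev_obs_n[i], self.action_n[i], rew_n[i], obs_n[i],
#                    done_n[i], *lstm_state_args(self, i, arglist), self.new_episode)
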
Example #9
def train(arglist):
    with U.single_threaded_session():
        # Create environment
        env = make_env(arglist.scenario, arglist, arglist.benchmark)
        # Create agent trainers
        obs_shape_n = [env.observation_space[i].shape for i in range(env.n)]
        num_adversaries = min(env.n, arglist.num_adversaries)
        trainers = get_trainers(env, num_adversaries, obs_shape_n, arglist)
        print(
            "Using good policy {} and adv policy {}".format(
                arglist.good_policy, arglist.adv_policy
            )
        )

        np.seterr(all="raise")  # raise on floating-point errors; set before the training loop runs

        # Initialize
        U.initialize()

        # Load previous results, if necessary
        if arglist.load_dir == "":
            arglist.load_dir = arglist.save_dir
        if arglist.display or arglist.restore or arglist.benchmark:
            print("Loading previous state...")
            U.load_state(arglist.load_dir)

        episode_rewards = [0.0]  # sum of rewards for all agents
        agent_rewards = [[0.0] for _ in range(env.n)]  # individual agent reward
        final_ep_rewards = []  # sum of rewards for training curve
        final_ep_ag_rewards = []  # agent rewards for training curve
        agent_info = [[[]]]  # placeholder for benchmarking info
        saver = tf.train.Saver()
        obs_n = env.reset()
        episode_step = 0
        train_step = 0
        t_start = time.time()

        print("making logger")
        tb_configure("logs/" + str(arglist.exp_name) + "_" + str(datetime.now()))
        print("Starting iterations...")
        while True:
            # get action
            action_n = [agent.action(obs) for agent, obs in zip(trainers, obs_n)]
            # environment step
            new_obs_n, rew_n, done_n, info_n = env.step(action_n)
            episode_step += 1
            done = all(done_n)
            terminal = episode_step >= arglist.max_episode_len
            # collect experience
            for i, agent in enumerate(trainers):
                agent.experience(
                    obs_n[i], action_n[i], rew_n[i], new_obs_n[i], done_n[i], terminal
                )
            obs_n = new_obs_n

            for i, rew in enumerate(rew_n):
                episode_rewards[-1] += rew  # / env.n (?) Do we want this to be the average across all agents?
                agent_rewards[i][-1] += rew

            if done or terminal:
                obs_n = env.reset()
                episode_step = 0
                episode_rewards.append(0)
                for a in agent_rewards:
                    a.append(0)
                agent_info.append([[]])

            # increment global step counter
            train_step += 1

            # for benchmarking learned policies
            if arglist.benchmark:
                for i, info in enumerate(info_n):
                    agent_info[-1][i].append(info_n["n"])
                if train_step > arglist.benchmark_iters and (done or terminal):
                    file_name = arglist.benchmark_dir + arglist.exp_name + ".pkl"
                    print("Finished benchmarking, now saving...")
                    with open(file_name, "wb") as fp:
                        pickle.dump(agent_info[:-1], fp)
                    break
                continue

            # for displaying learned policies
            if arglist.display:
                time.sleep(0.1)
                env.render()
                continue

            # update all trainers, if not in display or benchmark mode
            loss = None
            for agent in trainers:
                agent.preupdate()
            for agent in trainers:
                loss = agent.update(trainers, train_step)
            # log metrics

            # these lists grow per episode, so log the running totals of the current episode
            tb_log_value("episode_reward", episode_rewards[-1], train_step)
            tb_log_value("first_agent_reward", agent_rewards[0][-1], train_step)
            tb_log_value("second_agent_reward", agent_rewards[1][-1], train_step)
            if loss is not None:
                # agent.update() may return a list of stats; log the first entry
                loss_to_log = loss[0] if isinstance(loss, (list, tuple)) else loss
            else:
                loss_to_log = -100  # sentinel before the first update happens
            tb_log_value("loss", loss_to_log, train_step)

            # save model, display training output
            if terminal and (len(episode_rewards) % arglist.save_rate == 0):
                print("made it into the terminal/save_rate branch")
                U.save_state(arglist.save_dir, saver=saver)
                # print statement depends on whether or not there are adversaries
                if num_adversaries == 0:
                    print(
                        "steps: {}, episodes: {}, mean episode reward: {}, time: {}".format(
                            train_step,
                            len(episode_rewards),
                            np.mean(episode_rewards[-arglist.save_rate :]),
                            round(time.time() - t_start, 3),
                        )
                    )
                else:
                    print(
                        "steps: {}, episodes: {}, mean episode reward: {}, agent episode reward: {}, time: {}".format(
                            train_step,
                            len(episode_rewards),
                            np.mean(episode_rewards[-arglist.save_rate :]),
                            [
                                np.mean(rew[-arglist.save_rate :])
                                for rew in agent_rewards
                            ],
                            round(time.time() - t_start, 3),
                        )
                    )

                t_start = time.time()
                # Keep track of final episode reward
                final_ep_rewards.append(np.mean(episode_rewards[-arglist.save_rate :]))
                for rew in agent_rewards:
                    final_ep_ag_rewards.append(np.mean(rew[-arglist.save_rate :]))

            # saves final episode reward for plotting training curve later
            if len(episode_rewards) > arglist.num_episodes:
                rew_file_name = arglist.plots_dir + arglist.exp_name + "_rewards.pkl"
                with open(rew_file_name, "wb") as fp:
                    pickle.dump(final_ep_rewards, fp)
                agrew_file_name = (
                    arglist.plots_dir + arglist.exp_name + "_agrewards.pkl"
                )
                with open(agrew_file_name, "wb") as fp:
                    pickle.dump(final_ep_ag_rewards, fp)
                print("...Finished total of {} episodes.".format(len(episode_rewards)))
                break
Example #10
0
def train(arglist):

    with U.single_threaded_session():
        # Create environment
        env = make_env(arglist.scenario, arglist, arglist.benchmark)
        # Create agent trainers
        obs_shape_n = [env.observation_space[i].shape for i in range(env.n)]
        num_adversaries = min(env.n, arglist.num_adversaries)
        trainers = get_trainers(env, num_adversaries, obs_shape_n, arglist)
        print('Using good policy {} and adv policy {}'.format(arglist.good_policy, arglist.adv_policy))

        # Initialize
        U.initialize()

        # Load previous results, if necessary
        if arglist.load_dir == "":
            arglist.load_dir = arglist.save_dir
        if arglist.display or arglist.restore or arglist.benchmark:
            print('Loading previous state...')
            U.load_state(arglist.load_dir)

        episode_rewards = [0.0]  # sum of rewards for all agents
        episode_accuracy = [[] for _ in range(env.n)]  # per-agent intent-loss history
        agent_rewards = [[0.0] for _ in range(env.n)]  # individual agent reward
        final_ep_rewards = []  # sum of rewards for training curve
        final_ep_ag_rewards = []  # agent rewards for training curve
        final_ep_accuracy = []  # mean intent loss for training curve
        agent_info = [[[]]]  # placeholder for benchmarking info
        saver = tf.train.Saver()
        obs_n = env.reset()
        episode_step = 0
        train_step = 0
        t_start = time.time()

        record_accuracy = [[] for _ in range(env.n)]  # note: [[]] * env.n would alias one shared inner list
        #initialize act trajectories
        act_trajs = []
        for i in range(env.n):
            act_trajs.append(collections.deque(np.zeros((arglist.timestep, env.action_space[0].n)), maxlen = arglist.timestep) )
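        # act_trajs[i] is a fixed-length window (arglist.timestep entries) of agent i's most recent
        # action vectors, zero-initialised; get_traj_n turns these deques into the per-agent
        # trajectory inputs consumed by the intent networks below.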
      
        print('Starting iterations...')
        while True:
            # get action

            act_traj_n = get_traj_n(act_trajs)
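            # The four branches below differ only in which agents condition their policies on an
            # inferred "intent" vector (the I3 module): both sides, adversaries only, good agents
            # only, or neither (plain MADDPG for everyone).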
            
            if arglist.adv_i3 == 1 and arglist.good_i3 == 1:
                intent_n = [agent.intent(obs, act_traj) for agent, obs, act_traj in zip(trainers, obs_n, act_traj_n)]
                action_n = [agent.action(obs, intent) for agent, obs,intent in zip(trainers,obs_n, intent_n)]
                # environment step
                new_obs_n, rew_n, done_n, info_n = env.step(action_n)
                episode_step += 1
                done = all(done_n)
                terminal = (episode_step >= arglist.max_episode_len)
                # collect experience

                for i in range(len(act_trajs)):
                    act_trajs[i].append(action_n[i])

                act_traj_next_n = get_traj_n(act_trajs)
                intent_next_n = [agent.intent(obs, act_traj) for agent, obs, act_traj in zip(trainers, new_obs_n, act_traj_next_n)]

                for i, agent in enumerate(trainers):
                    agent.experience(obs_n[i], action_n[i], rew_n[i], new_obs_n[i], act_traj_n[i], intent_n[i],act_traj_next_n[i], intent_next_n[i], done_n[i], terminal)
                    if arglist.onpolicy_i == 1:
                        i_loss = agent.onpolicy_train_i(obs_n, act_traj_n,action_n )
                        episode_accuracy[i].append(i_loss)
            elif arglist.adv_i3 == 1 and arglist.good_i3 == 0:
                # adversaries use I3, good agents use plain MADDPG
                intent_n = []
                action_n = []
                for i in range(len(trainers)):
                    if i < arglist.num_adversaries:
                        intent = trainers[i].intent(obs_n[i], act_traj_n[i])
                        action = trainers[i].action(obs_n[i], intent)
                        action_n.append(action)
                        intent_n.append(intent)
                    else:
                        action = trainers[i].action(obs_n[i])    
                        action_n.append(action)
                        intent_n.append(np.zeros((arglist.timestep *  (env.action_space[0].n-1))))

                # environment step
                new_obs_n, rew_n, done_n, info_n = env.step(action_n)
                episode_step += 1
                done = all(done_n)
                terminal = (episode_step >= arglist.max_episode_len)

                for i in range(len(act_trajs)):
                    act_trajs[i].append(action_n[i])

                act_traj_next_n = get_traj_n(act_trajs)
                intent_next_n = []
                for i in range(len(trainers)):
                    if i < arglist.num_adversaries:
                        intent_next_n.append(trainers[i].intent(new_obs_n[i], act_traj_next_n[i]))
                    else:
                        intent_next_n.append(np.zeros((arglist.timestep *  (env.action_space[0].n-1))))  
                
                for i in range(len(trainers)):
                    if i < arglist.num_adversaries:
                        trainers[i].experience(obs_n[i], action_n[i], rew_n[i], new_obs_n[i], act_traj_n[i], intent_n[i],act_traj_next_n[i], intent_next_n[i], done_n[i], terminal)
                        if arglist.onpolicy_i == 1:
                            i_loss = trainers[i].onpolicy_train_i(obs_n, act_traj_n,action_n)
                            episode_accuracy[i].append(i_loss)
                    else:
                        trainers[i].experience(obs_n[i], action_n[i], rew_n[i], new_obs_n[i], done_n[i], terminal)    
            elif arglist.good_i3 == 1 and arglist.adv_i3 ==0:
                # good agents use I3, adversaries use plain MADDPG
                intent_n = []
                action_n = []
                for i in range(len(trainers)):
                    if i >=arglist.num_adversaries:
                        intent = trainers[i].intent(obs_n[i], act_traj_n[i])
                        action = trainers[i].action(obs_n[i], intent)
                        action_n.append(action)
                        intent_n.append(intent)
                    else:
                        action = trainers[i].action(obs_n[i])    
                        action_n.append(action)
                        intent_n.append(np.zeros((arglist.timestep *  (env.action_space[0].n-1))))

                # environment step
                new_obs_n, rew_n, done_n, info_n = env.step(action_n)
                episode_step += 1
                done = all(done_n)
                terminal = (episode_step >= arglist.max_episode_len)

                for i in range(len(act_trajs)):
                    act_trajs[i].append(action_n[i])

                act_traj_next_n = get_traj_n(act_trajs)
                intent_next_n = []
                for i in range(len(trainers)):
                    if i  >= arglist.num_adversaries:
                        intent_next_n.append(trainers[i].intent(new_obs_n[i], act_traj_next_n[i]))
                    else:
                        intent_next_n.append(np.zeros((arglist.timestep *  (env.action_space[0].n-1))))
                
                for i in range(len(trainers)):
                    if i  >= arglist.num_adversaries:
                        trainers[i].experience(obs_n[i], action_n[i], rew_n[i], new_obs_n[i], act_traj_n[i], intent_n[i],act_traj_next_n[i], intent_next_n[i], done_n[i], terminal)
                        if arglist.onpolicy_i == 1:
                            i_loss = trainers[i].onpolicy_train_i(obs_n, act_traj_n,action_n )
                            episode_accuracy[i].append(i_loss)
                    else:
                        trainers[i].experience(obs_n[i], action_n[i], rew_n[i], new_obs_n[i], done_n[i], terminal)    
            else:
                action_n = [agent.action(obs) for agent, obs in zip(trainers,obs_n)] 
                new_obs_n, rew_n, done_n, info_n = env.step(action_n)
                episode_step += 1
                done = all(done_n)
                terminal = (episode_step >= arglist.max_episode_len)
                # collect experience
                for i, agent in enumerate(trainers):
                    agent.experience(obs_n[i], action_n[i], rew_n[i], new_obs_n[i], done_n[i], terminal)

            obs_n = new_obs_n
          
            for i, rew in enumerate(rew_n):
                episode_rewards[-1] += rew
                agent_rewards[i][-1] += rew

            if done or terminal:
                obs_n = env.reset()
                episode_step = 0
                episode_rewards.append(0)

                for a in agent_rewards:
                    a.append(0)
                agent_info.append([[]])

            # increment global step counter
            train_step += 1

            # for benchmarking learned policies
            if arglist.benchmark:
                for i, info in enumerate(info_n):
                    agent_info[-1][i].append(info_n['n'])
                if train_step > arglist.benchmark_iters and (done or terminal):
                    file_name = arglist.benchmark_dir + arglist.exp_name + '.pkl'
                    print('Finished benchmarking, now saving...')
                    with open(file_name, 'wb') as fp:
                        pickle.dump(agent_info[:-1], fp)
                    break
                continue

            # for displaying learned policies
            if arglist.display:
                time.sleep(0.5)
                env.render()
                continue

            # update all trainers, if not in display or benchmark mode
            loss = None
            for agent in trainers:
                agent.preupdate()
            for agent in trainers:
                loss = agent.update(trainers, train_step)
                
            # save model, display training output
            if terminal and (len(episode_rewards) % arglist.save_rate == 0):
                U.save_state(arglist.save_dir, saver=saver)
                # print statement depends on whether or not there are adversaries
                if num_adversaries == 0:
                    print("steps: {}, episodes: {}, mean episode reward: {}, time: {}".format(
                        train_step, len(episode_rewards), np.mean(episode_rewards[-arglist.save_rate:]), round(time.time()-t_start, 3)))
                else:
                    print("steps: {}, episodes: {}, mean episode reward: {}, agent episode reward: {}, time: {}".format(
                        train_step, len(episode_rewards), np.mean(episode_rewards[-arglist.save_rate:]),
                        [np.mean(rew[-arglist.save_rate:]) for rew in agent_rewards], round(time.time()-t_start, 3)))
                t_start = time.time()
                # Keep track of final episode reward
                final_ep_rewards.append(np.mean(episode_rewards[-arglist.save_rate:]))
                for rew in agent_rewards:
                    final_ep_ag_rewards.append(np.mean(rew[-arglist.save_rate:]))
                
                for iloss in episode_accuracy:
                    if len(iloss) < arglist.save_rate:
                        continue
                    final_ep_accuracy.append(np.mean(iloss[-arglist.save_rate:]))

            # saves final episode reward for plotting training curve later
            if len(episode_rewards) > arglist.num_episodes:
                rew_file_name = arglist.plots_dir + arglist.exp_name + str(arglist.seed) + '_rewards.pkl'
                with open(rew_file_name, 'wb') as fp:
                    pickle.dump(final_ep_rewards, fp)
                agrew_file_name = arglist.plots_dir + arglist.exp_name + str(arglist.seed) + '_agrewards.pkl'
                with open(agrew_file_name, 'wb') as fp:
                    pickle.dump(final_ep_ag_rewards, fp)
                acc_file = arglist.plots_dir + arglist.exp_name + str(arglist.seed) + '_accuracy.pkl'
                with open(acc_file, 'wb') as fp:
                    pickle.dump(final_ep_accuracy, fp)
                print('...Finished total of {} episodes.'.format(len(episode_rewards)))
                break
Example #11
0
def train(arglist):
    with U.single_threaded_session():
        # Create environment
        env = make_env(arglist, arglist.benchmark)

        # Create agent trainers
        obs_shape_n = [env.observation_space.shape for i in range(env.n)]
        trainers = get_trainers(env, obs_shape_n, arglist)
        print('Using good policy {}'.format(arglist.good_policy))

        # Initialize
        U.initialize()

        # Load previous results, if necessary
        if arglist.load_dir == "":
            arglist.load_dir = arglist.save_dir
        if arglist.display or arglist.restore or arglist.benchmark:
            print('Loading previous state...')
            U.load_state(arglist.load_dir)

        episode_rewards = [0.0]  # sum of rewards for all agents
        agent_rewards = [[0.0]
                         for _ in range(env.n)]  # individual agent reward
        final_ep_rewards = []  # sum of rewards for training curve
        final_ep_ag_rewards = []  # agent rewards for training curve
        agent_info = [[[]]]  # placeholder for benchmarking info
        saver = tf.train.Saver()
        obs_n = env.reset()
        episode_step = 0
        train_step = 0
        t_start = time.time()
        done = 0
        current_player_index = 0

        no_op_actions = False

        print('Starting iterations...')
        while True:
            # get action
            current_player_obs = np.asarray(obs_n)
            original_action = trainers[current_player_index].action(
                current_player_obs)
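            # Two ways to respect the environment's action mask:
            #  - no_op_actions: keep re-sampling from the raw policy until a valid action comes up
            #  - otherwise: zero out invalid entries, renormalise, and sample from the masked distribution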

            if (no_op_actions):
                action = np.random.choice(np.linspace(0,
                                                      env.action_space.n - 1,
                                                      num=env.action_space.n,
                                                      dtype=int),
                                          1,
                                          p=original_action)[0]
                mask = env.getValidActions()

                while (mask[action] == 0):
                    action = np.random.choice(np.linspace(
                        0,
                        env.action_space.n - 1,
                        num=env.action_space.n,
                        dtype=int),
                                              1,
                                              p=original_action)[0]
            else:
                # get action mask
                mask = env.getValidActions()
                # zero out invalid options
                masked_actions = mask * original_action
                # normalize
                masked_actions = masked_actions / np.nansum(masked_actions)
                # Get action with given probability
                if (np.isnan(masked_actions).any()):
                    print(current_player_obs)
                    print(masked_actions)
                    print(np.nansum(masked_actions))
                    print(original_action)
                try:
                    action = np.random.choice(np.linspace(
                        0,
                        env.action_space.n - 1,
                        num=env.action_space.n,
                        dtype=int),
                                              1,
                                              p=masked_actions)[0]
                except ValueError:
                    # invalid probability vector (e.g. NaN); fall back to a uniform random choice
                    print("Exception: choosing random action")
                    action = np.random.choice(
                        np.linspace(0,
                                    env.action_space.n - 1,
                                    num=env.action_space.n,
                                    dtype=int), 1)[0]

            new_obs, rew, done, info = env.step(action)

            #trainers[current_player_index].experience(current_player_obs, original_action, mask, rew, new_obs, done)
            trainers[current_player_index].experience(current_player_obs,
                                                      masked_actions, mask,
                                                      rew, new_obs, done)

            current_player_index += 1
            if (current_player_index >= len(trainers)):
                current_player_index = 0

            obs_n = new_obs

            episode_rewards[-1] += rew
            agent_rewards[current_player_index][-1] += rew

            if done:
                obs_n = env.reset()
                episode_step = 0
                episode_rewards.append(0)
                for a in agent_rewards:
                    a.append(0)
                agent_info.append([[]])
                current_player_index = 0

            # increment global step counter
            train_step += 1

            # for benchmarking learned policies
            if arglist.benchmark:
                for i, _ in enumerate(info):
                    agent_info[-1][i].append(info['n'])
                if train_step > arglist.benchmark_iters and (done):
                    file_name = arglist.benchmark_dir + arglist.exp_name + '.pkl'
                    print('Finished benchmarking, now saving...')
                    with open(file_name, 'wb') as fp:
                        pickle.dump(agent_info[:-1], fp)
                    break
                continue

            # for displaying learned policies
            if arglist.display:
                time.sleep(0.1)
                env.render()
                continue

            # update all trainers, if not in display or benchmark mode
            loss = None
            for agent in trainers:
                agent.preupdate()
            for agent in trainers:
                loss = agent.update(trainers, train_step)
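                # "Sleep" regimen (specific to this repo's trainer): periodically re-run update() in
                # sleeping mode until the policy loss rises ~5% above its pre-sleep value or 10 extra
                # iterations have run.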
                if (loss is not None and agent.sleep_regimen
                        and agent.agent_mic != 0 and train_step % 100
                        == 0):  # Change sleep frequency here if desired
                    original_policy_loss = loss[1]
                    new_loss = agent.update(trainers,
                                            train_step,
                                            sleeping=True)[1]
                    sleep_iteration = 0
                    while ((sleep_iteration < 10)
                           and (new_loss < original_policy_loss * 1.05)):
                        new_loss = agent.update(trainers,
                                                train_step,
                                                sleeping=True)[1]
                        sleep_iteration += 1
                        #print("sleep walking")

            # save model, display training output
            if done and (len(episode_rewards) % arglist.save_rate == 0):
                U.save_state(arglist.save_dir, saver=saver)
                print(
                    "steps: {}, episodes: {}, mean episode reward: {}, time: {}"
                    .format(train_step, len(episode_rewards),
                            np.mean(episode_rewards[-arglist.save_rate:]),
                            round(time.time() - t_start, 3)))
                t_start = time.time()
                # Keep track of final episode reward
                final_ep_rewards.append(
                    np.mean(episode_rewards[-arglist.save_rate:]))
                for rew in agent_rewards:
                    final_ep_ag_rewards.append(
                        np.mean(rew[-arglist.save_rate:]))

            # saves final episode reward for plotting training curve later
            if len(episode_rewards) > arglist.num_episodes:
                print(arglist.plots_dir)
                print(arglist.exp_name)

                rew_file_name = arglist.plots_dir + arglist.exp_name + '_rewards.pkl'
                with open(rew_file_name, 'wb') as fp:
                    pickle.dump(final_ep_rewards, fp)
                agrew_file_name = arglist.plots_dir + arglist.exp_name + '_agrewards.pkl'
                with open(agrew_file_name, 'wb') as fp:
                    pickle.dump(final_ep_ag_rewards, fp)
                print('...Finished total of {} episodes.'.format(
                    len(episode_rewards)))
                break
Example #12
0
def play(arglist):
    with U.single_threaded_session():
        # Create environment
        env = make_env(arglist.scenario, arglist, arglist.benchmark)
        # Create agent trainers
        obs_shape_n = [env.observation_space[i].shape for i in range(env.n)]
        num_adversaries = min(env.n, arglist.num_adversaries)
        trainers = get_trainers(env, num_adversaries, obs_shape_n, arglist)
        print('Using good policy {} and adv policy {}'.format(
            arglist.good_policy, arglist.adv_policy))

        # Initialize
        U.initialize()

        # Load previous results, if necessary
        if arglist.load_dir == "":
            arglist.load_dir = arglist.save_dir
        if arglist.display or arglist.restore or arglist.benchmark:
            print('Loading previous state...')
            U.load_state(arglist.load_dir)

        episode_rewards = [0.0]  # sum of rewards for all agents
        agent_rewards = [[0.0]
                         for _ in range(env.n)]  # individual agent reward
        final_ep_rewards = []  # sum of rewards for training curve
        final_ep_ag_rewards = []  # agent rewards for training curve
        agent_info = [[[]]]  # placeholder for benchmarking info
        saver = tf.train.Saver()
        obs_n = env.reset()
        episode_step = 0
        train_step = 0
        t_start = time.time()

        print('Starting iterations...')

        # create world
        world = scenario.make_world()
        # create multiagent environment
        env = MultiAgentEnv(world,
                            scenario.reset_world,
                            scenario.reward,
                            scenario.observation,
                            info_callback=None,
                            shared_viewer=True)
        env.window_pos = 'right'
        # render call to create viewer window (necessary only for interactive policies)
        env.render()
        # create interactive policies for one agent
        policy = InteractivePolicy(env, -1)
        # execution loop
        obs_n = env.reset()
        while True:
            # query for action from each agent's policy
            act_n = [agent.action(obs)
                     for agent, obs in zip(trainers, obs_n)]  # trained policy
            act_n[-1] = policy.action(obs_n[-1])  # interactive keyboard policy
            # step environment
            new_obs_n, reward_n, done_n, info_n = env.step(act_n)
            episode_step += 1
            done = all(done_n)
            terminal = (episode_step >= arglist.max_episode_len)

            # collect experience
            for i, agent in enumerate(trainers):
                agent.experience(obs_n[i], act_n[i], reward_n[i], new_obs_n[i],
                                 done_n[i], terminal)
            obs_n = new_obs_n

            for i, rew in enumerate(reward_n):
                episode_rewards[-1] += rew
                agent_rewards[i][-1] += rew

            if done or terminal:
                obs_n = env.reset()
                episode_step = 0
                episode_rewards.append(0)
                for a in agent_rewards:
                    a.append(0)
                agent_info.append([[]])

            # increment global step counter
            train_step += 1

            # for game over
            try:
                if scenario.game_over:
                    sys.exit(0)
            except AttributeError:
                pass

            # for benchmarking learned policies
            if arglist.benchmark:
                for i, info in enumerate(info_n):
                    agent_info[-1][i].append(info_n['n'])
                if train_step > arglist.benchmark_iters and (done or terminal):
                    file_name = arglist.benchmark_dir + arglist.exp_name + '.pkl'
                    print('Finished benchmarking, now saving...')
                    with open(file_name, 'wb') as fp:
                        pickle.dump(agent_info[:-1], fp)
                    break
                continue

            # render all agent views
            time.sleep(arglist.delay)
            env.render()
            # display rewards
            for agent in env.world.agents:
                pass  # print(agent.name + " reward: %0.3f" % env._get_reward(agent))

            # update all trainers, if not in display or benchmark mode
            loss = None
            for agent in trainers:
                agent.preupdate()
            for agent in trainers:
                loss = agent.update(trainers, train_step)

            # save model, display training output
            if terminal and (len(episode_rewards) % arglist.save_rate == 0):
                U.save_state(arglist.save_dir, saver=saver)
                # print statement depends on whether or not there are adversaries
                if num_adversaries == 0:
                    print(
                        "steps: {}, episodes: {}, mean episode reward: {}, time: {}"
                        .format(train_step, len(episode_rewards),
                                np.mean(episode_rewards[-arglist.save_rate:]),
                                round(time.time() - t_start, 3)))
                else:
                    print(
                        "steps: {}, episodes: {}, mean episode reward: {}, agent episode reward: {}, time: {}"
                        .format(train_step, len(episode_rewards),
                                np.mean(episode_rewards[-arglist.save_rate:]),
                                [
                                    np.mean(rew[-arglist.save_rate:])
                                    for rew in agent_rewards
                                ], round(time.time() - t_start, 3)))
                t_start = time.time()
                # Keep track of final episode reward
                final_ep_rewards.append(
                    np.mean(episode_rewards[-arglist.save_rate:]))
                for rew in agent_rewards:
                    final_ep_ag_rewards.append(
                        np.mean(rew[-arglist.save_rate:]))

            # saves final episode reward for plotting training curve later
            if len(episode_rewards) > arglist.num_episodes:
                U.save_state(arglist.plots_dir, saver=saver)
                rew_file_name = arglist.plots_dir + arglist.exp_name + '_rewards.pkl'
                with open(rew_file_name, 'wb') as fp:
                    pickle.dump(final_ep_rewards, fp)
                agrew_file_name = arglist.plots_dir + arglist.exp_name + '_agrewards.pkl'
                with open(agrew_file_name, 'wb') as fp:
                    pickle.dump(final_ep_ag_rewards, fp)
                print('...Finished total of {} episodes.'.format(
                    len(episode_rewards)))
                break
Example #13
0
def train(arglist):
    with U.single_threaded_session():
        # Create environment
        env = make_env(arglist.scenario, arglist, arglist.benchmark)
        # Create agent trainers
        obs_shape_n = [env.observation_space[i].shape for i in range(env.n)]
        num_adversaries = min(env.n, arglist.num_adversaries)
        trainers = get_trainers(env, num_adversaries, obs_shape_n, arglist)
        print('Using good policy {} and adv policy {}'.format(
            arglist.good_policy, arglist.adv_policy))
        agents = copy.copy(trainers)

        # Initialize
        U.initialize()

        # Load previous results, if necessary
        if arglist.load_dir == "":
            arglist.load_dir = arglist.save_dir
        if arglist.display or arglist.restore or arglist.benchmark:
            print('Loading previous state...')
            U.load_state(arglist.load_dir)

        episode_rewards = [0.0]  # sum of rewards for all agents
        agent_rewards = [[0.0]
                         for _ in range(env.n)]  # individual agent reward
        final_ep_rewards = []  # sum of rewards for training curve
        final_ep_ag_rewards = []  # agent rewards for training curve
        agent_info = [[[]]]  # placeholder for benchmarking info
        saver = tf.train.Saver()
        obs_n = env.reset()
        episode_step = 0
        train_step = 0
        t_start = time.time()

        print('Starting iterations...')
        if (arglist.record != False):
            writer = skvideo.io.FFmpegWriter("{}.avi".format(arglist.record))
        while True:
            # shuffle agents to prevent them from learning fixed strategy
            if not arglist.benchmark and arglist.shuffle == 'timestep':
                random.shuffle(agents)
            # get action
            action_n = [agent.action(obs) for agent, obs in zip(agents, obs_n)]
            # environment step
            new_obs_n, rew_n, done_n, info_n = env.step(action_n)
            episode_step += 1
            done = all(done_n)
            terminal = (episode_step >= arglist.max_episode_len)
            # collect experience
            #print([ag.state.p_pos for ag in env.agents])
            for i, agent in enumerate(agents):
                agent.experience(obs_n[i], action_n[i], rew_n[i], new_obs_n[i],
                                 done_n[i], terminal)
            obs_n = new_obs_n

            for i, rew in enumerate(rew_n):
                episode_rewards[-1] += rew
                agent_rewards[i][-1] += rew

            for i, info in enumerate(info_n):
                agent_info[-1][i].append(info_n['n'])

            # for displaying learned policies
            if arglist.display:
                time.sleep(0.01)
                x = env.render(mode='rgb_array')
                if (arglist.record != False):
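                    # Overlay agent and landmark labels on the rendered frame before writing it to
                    # the video (FixPosition and AddTextToImage are helpers from this repo).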
                    LM = [ag.state.p_pos for ag in env.world.landmarks]
                    LM = [FixPosition(j, 10, 10) for j in LM]
                    AP = [ag.state.p_pos for ag in env.agents]
                    AP = [FixPosition(j) for j in AP]
                    img = np.copy(x[0])
                    img = AddTextToImage(
                        img,
                        text=['Agent {}', 'Agent {}', 'Agent {}'],
                        color=(0, 0, 255),
                        pos=AP)
                    img = AddTextToImage(img,
                                         text=['LM{}', 'LM{}', 'LM{}'],
                                         pos=LM,
                                         color=(255, 0, 0))
                    writer.writeFrame(img)

            if done or terminal:
                if (arglist.record != False):
                    writer.close()
                    exit()
                obs_n = env.reset()
                episode_step = 0
                episode_rewards.append(0)
                for a in agent_rewards:
                    a.append(0)
                agent_info.append([[]])
                # shuffle agents to prevent them from learning fixed strategy
                if not arglist.benchmark and arglist.shuffle == 'episode':
                    random.shuffle(agents)

            # increment global step counter
            train_step += 1

            # for benchmarking learned policies
            if arglist.benchmark:
                if train_step >= arglist.benchmark_iters and (done
                                                              or terminal):
                    file_name = arglist.benchmark_dir + arglist.exp_name + '.pkl'
                    print('Finished benchmarking, now saving...')
                    with open(file_name, 'wb') as fp:
                        pickle.dump(agent_info[:-1], fp)
                        if arglist.save_replay:
                            # save in original order
                            for i, agent in enumerate(trainers):
                                pickle.dump(agent.replay_buffer._storage, fp)
                    break
                continue

            # update all agents, if not in display or benchmark mode
            loss = None
            for agent in agents:
                agent.preupdate()
            for agent in agents:
                loss = agent.update(agents, train_step)
                # if shared model, train only once
                if arglist.shared:
                    break

            # save model, display training output
            if terminal and (len(episode_rewards) % arglist.save_rate == 0):
                U.save_state(arglist.save_dir, saver=saver)
                # print statement depends on whether or not there are adversaries
                if num_adversaries == 0:
                    print(
                        "steps: {}, episodes: {}, mean episode reward: {}, time: {}"
                        .format(train_step, len(episode_rewards),
                                np.mean(episode_rewards[-arglist.save_rate:]),
                                round(time.time() - t_start, 3)))
                else:
                    print(
                        "steps: {}, episodes: {}, mean episode reward: {}, agent episode reward: {}, time: {}"
                        .format(train_step, len(episode_rewards),
                                np.mean(episode_rewards[-arglist.save_rate:]),
                                [
                                    np.mean(rew[-arglist.save_rate:])
                                    for rew in agent_rewards
                                ], round(time.time() - t_start, 3)))
                t_start = time.time()
                # Keep track of final episode reward
                final_ep_rewards.append(
                    np.mean(episode_rewards[-arglist.save_rate:]))
                for rew in agent_rewards:
                    final_ep_ag_rewards.append(
                        np.mean(rew[-arglist.save_rate:]))

            # saves final episode reward for plotting training curve later
            if len(episode_rewards) > arglist.num_episodes:
                rew_file_name = arglist.plots_dir + arglist.exp_name + '_rewards.pkl'
                with open(rew_file_name, 'wb') as fp:
                    pickle.dump(final_ep_rewards, fp)
                agrew_file_name = arglist.plots_dir + arglist.exp_name + '_agrewards.pkl'
                with open(agrew_file_name, 'wb') as fp:
                    pickle.dump(final_ep_ag_rewards, fp)
                print('...Finished total of {} episodes.'.format(
                    len(episode_rewards)))
                break
Example #14
0
def learn_old(
        env,
        arglist,
        seed=None,
        total_timesteps=None,
        nb_epochs=None,  # with default settings, perform 1M steps total
        nb_epoch_cycles=20,
        nb_rollout_steps=100,
        reward_scale=1.0,
        render=False,
        render_eval=False,
        noise_type='adaptive-param_0.2',
        normalize_returns=False,
        normalize_observations=True,
        critic_l2_reg=1e-2,
        actor_lr=1e-4,
        critic_lr=1e-3,
        popart=False,
        gamma=0.99,
        clip_norm=None,
        nb_train_steps=50,  # per epoch cycle and MPI worker,
        nb_eval_steps=100,
        batch_size=64,  # per MPI worker
        tau=0.01,
        eval_env=None,
        param_noise_adaption_interval=50,
        save_interval=100,
        **network_kwargs):
    with U.single_threaded_session():
        # Create environment
        env = make_env(arglist.scenario, arglist, arglist.benchmark)
        # Create agent trainers
        obs_shape_n = [env.observation_space[i].shape for i in range(env.n)]
        num_adversaries = min(env.n, arglist.num_adversaries)
        trainers = get_trainers(env, num_adversaries, obs_shape_n, arglist)
        print('Using good policy {} and adv policy {}'.format(
            arglist.good_policy, arglist.adv_policy))

        # Initialize
        U.initialize()

        # Load previous results, if necessary
        if arglist.load_dir == "":
            arglist.load_dir = arglist.save_dir
        if arglist.display or arglist.restore or arglist.benchmark:
            print('Loading previous state...')
            U.load_state(arglist.load_dir)

        episode_rewards = [0.0]  # sum of rewards for all agents
        agent_rewards = [[0.0]
                         for _ in range(env.n)]  # individual agent reward
        final_ep_rewards = []  # sum of rewards for training curve
        final_ep_ag_rewards = []  # agent rewards for training curve
        agent_info = [[[]]]  # placeholder for benchmarking info
        saver = tf.train.Saver()
        obs_n = env.reset()
        episode_step = 0
        # train_step = 0
        t_start = time.time()
        epinfobuf = deque(maxlen=100)
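        # epinfobuf keeps the last 100 episodes' reward/length/elapsed-time dicts so the logger can
        # report running means below.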

        print('Starting iterations...')
        total_timesteps = arglist.num_episodes * arglist.max_episode_len
        for train_step in range(1, total_timesteps + 1):
            # get action
            action_n = [
                agent.action(obs) for agent, obs in zip(trainers, obs_n)
            ]
            # environment step
            new_obs_n, rew_n, done_n, info_n = env.step(action_n)
            episode_step += 1
            done = any(done_n)
            terminal = (episode_step >= arglist.max_episode_len)
            # collect experience
            for i, agent in enumerate(trainers):
                agent.experience(obs_n[i], action_n[i], rew_n[i], new_obs_n[i],
                                 done_n[i], terminal)
            obs_n = new_obs_n

            for i, rew in enumerate(rew_n):
                episode_rewards[-1] += rew
                agent_rewards[i][-1] += rew

            if done or terminal:
                obs_n = env.reset()
                # save episode info
                epinfobuf.append({
                    "r": episode_rewards[-1],
                    "l": episode_step,
                    "t": round(time.time() - t_start, 6)
                })
                # reset episode variables
                episode_step = 0
                episode_rewards.append(0)
                for a in agent_rewards:
                    a.append(0)
                agent_info.append([[]])

            if train_step % arglist.log_interval == 0:
                # logger.logkv(Config.tensorboard_rootdir+"serial_timesteps", train_step)
                # logger.logkv(Config.tensorboard_rootdir+"num_update", update)
                logger.logkv(Config.tensorboard_rootdir + "total_timesteps",
                             train_step)
                logger.logkv(Config.tensorboard_rootdir + "current_episode",
                             int(train_step / arglist.max_episode_len))
                # logger.logkv(Config.tensorboard_rootdir+"fps", fps)
                # logger.logkv(Config.tensorboard_rootdir+"explained_variance", float(ev))
                logger.logkv(Config.tensorboard_rootdir + 'ep_reward_mean',
                             safemean([epinfo['r'] for epinfo in epinfobuf]))
                logger.logkv(Config.tensorboard_rootdir + 'ep_length',
                             safemean([epinfo['l'] for epinfo in epinfobuf]))
                logger.logkv(Config.tensorboard_rootdir + 'time_elapsed',
                             round(time.time() - t_start, 6))
                # for (lossval, lossname) in zip(lossvals, model.loss_names):
                #     logger.logkv(Config.tensorboard_rootdir+lossname, lossval)
                if MPI is None or MPI.COMM_WORLD.Get_rank() == 0:
                    logger.dumpkvs()

            # increment global step counter
            # train_step += 1

            # for benchmarking learned policies
            if arglist.benchmark:
                for i, info in enumerate(info_n):
                    agent_info[-1][i].append(info_n['n'])
                if train_step > arglist.benchmark_iters and (done or terminal):
                    file_name = arglist.benchmark_dir + arglist.exp_name + '.pkl'
                    print('Finished benchmarking, now saving...')
                    with open(file_name, 'wb') as fp:
                        pickle.dump(agent_info[:-1], fp)
                    break
                continue

            # for displaying learned policies
            if arglist.display:
                time.sleep(0.1)
                env.render()
                continue

            # update all trainers, if not in display or benchmark mode
            loss = None
            for agent in trainers:
                agent.preupdate()
            for agent in trainers:
                loss = agent.update(trainers, train_step)

                # log loss info for each agent
                if (train_step % arglist.log_interval == 0) and loss:
                    lossvals = [
                        np.mean(data, axis=0)
                        if isinstance(data, list) else data for data in loss
                    ]
                    for (lossval, lossname) in zip(lossvals, agent.loss_names):
                        log_key = "{}{}/{}".format(Config.tensorboard_rootdir,
                                                   lossname, agent.name)
                        logger.logkv(log_key, lossval)

            # save model if at save_rate step or if its the first train_step
            save_model = arglist.save_rate and (
                (train_step % arglist.save_rate == 0) or
                (train_step == total_timesteps))
            # only save model if logger dir specified and current node rank is 0 (multithreading)
            save_model = save_model and bool(logger.get_dir()) and (
                MPI is None or MPI.COMM_WORLD.Get_rank() == 0)
            if save_model:
                checkdir = osp.join(logger.get_dir(), 'checkpoints')
                os.makedirs(checkdir, exist_ok=True)
                savepath = osp.join(checkdir, '%.5i' % train_step)
                print('Saving to', savepath)
                U.save_state(savepath, saver=saver)
                # model.save(savepath)
        env.close()
Example #15
0
def train(env, arglist, trainers):

    episode_rewards = [0.0]  # sum of rewards for all agents
    agent_rewards = [[0.0]
                     for _ in range(env.n)]  # individual agent reward
    agent_info = [[[]]]  # placeholder for benchmarking info
    saver = tf.train.Saver()
    obs_n = env.reset()
    episode_step = 0
    train_step = 0
    t_start = time.time()
    num_adversaries = min(env.n, arglist.num_adversaries)  # assumed available on arglist, as in the other examples

    print('Starting iterations...')
    while True:
        # get action
        action_n = [agent.action(obs)
                    for agent, obs in zip(trainers, obs_n)]
        # environment step
        new_obs_n, rew_n, done_n, info_n = env.step(action_n)
        episode_step += 1
        done = all(done_n)
        terminal = (episode_step >= arglist.max_episode_len)
        # collect experience
        for i, agent in enumerate(trainers):
            agent.experience(
                obs_n[i], action_n[i], rew_n[i], new_obs_n[i], done_n[i], terminal)
        obs_n = new_obs_n

        for i, rew in enumerate(rew_n):
            episode_rewards[-1] += rew
            agent_rewards[i][-1] += rew

        if done or terminal:
            obs_n = env.reset()
            episode_step = 0
            episode_rewards.append(0)
            for a in agent_rewards:
                a.append(0)
            agent_info.append([[]])

        # increment global step counter
        train_step += 1

        # for benchmarking learned policies
        if arglist.benchmark:
            for i, info in enumerate(info_n):
                agent_info[-1][i].append(info_n['n'])
            if train_step > arglist.benchmark_iters and (done or terminal):
                file_name = arglist.benchmark_dir + arglist.exp_name + '.pkl'
                print('Finished benchmarking, now saving...')
                with open(file_name, 'wb') as fp:
                    pickle.dump(agent_info[:-1], fp)
                break
            continue

        # for displaying learned policies
        if arglist.display:
            time.sleep(0.1)
            env.render()
            continue

        # update all trainers, if not in display or benchmark mode
        for agent in trainers:
            agent.preupdate()
        for agent in trainers:
            loss = agent.update(trainers, train_step)

        episode = len(episode_rewards) + arglist.restore_episode
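        # Offset by restore_episode so episode counts continue correctly after restoring a checkpoint.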
        # save model, display training output
        if (done or terminal) and (episode % arglist.save_rate == 0):
            mean_reward = np.mean(episode_rewards[-arglist.save_rate:])
            agents_episode_rewards = [
                np.mean(rew[-arglist.save_rate:]) for rew in agent_rewards]
            time_spent = round(time.time()-t_start, 3)

            U.save_state(
                f"{arglist.save_dir}/{arglist.exp_name}/episode_{episode}/model", saver=saver)
            # print statement depends on whether or not there are adversaries
            if num_adversaries == 0:
                print("steps: {}, episodes: {}, mean episode reward: {}, time: {}".format(
                    train_step, episode, mean_reward, time_spent))
            else:
                print("steps: {}, episodes: {}, mean episode reward: {}, agent episode reward: {}, time: {}".format(
                    train_step, episode, mean_reward, agents_episode_rewards, time_spent))
            t_start = time.time()

            # Keep track of rewards
            save_rewards(arglist, episode, mean_reward,
                         agents_episode_rewards)

        # saves final episode reward for plotting training curve later
        if episode >= arglist.num_episodes:
            print('...Finished total of {} episodes.'.format(episode))
            break
Example #16
0
def train(arglist):
    # To make sure that training and testing are based on diff seeds
    if arglist.restore:
        create_seed(np.random.randint(2))
    else:
        create_seed(arglist.seed)

    with U.single_threaded_session() as sess:
        # Create environment
        env = make_env(arglist.scenario, arglist, arglist.benchmark)
        # Create agent trainers
        obs_shape_n = [env.observation_space[i].shape for i in range(env.n)]

        num_adversaries = min(env.n, arglist.num_adversaries)
        trainers = get_trainers(env, num_adversaries, obs_shape_n, arglist)
        print('Using good policy {} and adv policy {}'.format(arglist.good_policy, arglist.adv_policy))

        # Initialize
        U.initialize()

        # Load previous results, if necessary
        if arglist.load_dir == "":
            arglist.load_dir = arglist.save_dir
        if arglist.restore or arglist.benchmark:
            print('Loading previous state...')
            U.load_state(arglist.load_dir)

        if arglist.analysis:
            print("Starting analysis on {}...".format(arglist.analysis))
            if arglist.analysis != 'video':
                analyze.run_analysis(arglist, env, trainers)
            return # should be a single run

        episode_rewards = [0.0]  # sum of rewards for all agents
        agent_rewards = [[0.0] for _ in range(env.n)]  # individual agent reward
        final_ep_rewards = []  # sum of rewards for training curve
        final_ep_ag_rewards = []  # agent rewards for training curve
        agent_info = [[[]]]  # placeholder for benchmarking info
        saver = tf.train.Saver()
        obs_n = env.reset()
        episode_step = 0
        train_step = 0
        t_start = time.time()
        new_episode = True # start of a new episode (used for replay buffer)
        start_saving_comm = False

        if arglist.graph:
            print("Setting up graph writer!")
            writer = tf.summary.FileWriter("learning_curves/graph",sess.graph)

        print('Starting iterations...')
        while True:
            if arglist.actor_lstm:
                # get critic input states
                p_in_c_n, p_in_h_n = get_lstm_states('p', trainers) # num_trainers x 1 x 1 x 64
            if arglist.critic_lstm:
                q_in_c_n, q_in_h_n = get_lstm_states('q', trainers) # num_trainers x 1 x 1 x 64

            # get action
            action_n = [agent.action(obs) for agent, obs in zip(trainers,obs_n)]
            if arglist.critic_lstm:
                # get critic output states
                p_states = [p_in_c_n, p_in_h_n] if arglist.actor_lstm else []
                update_critic_lstm(trainers, obs_n, action_n, p_states)
                q_out_c_n, q_out_h_n = get_lstm_states('q', trainers) # num_trainers x 1 x 1 x 64
            if arglist.actor_lstm:
                p_out_c_n, p_out_h_n = get_lstm_states('p', trainers) # num_trainers x 1 x 1 x 64

            # environment step
            new_obs_n, rew_n, done_n, info_n = env.step(action_n)
            episode_step += 1
            done = all(done_n)
            terminal = (episode_step >= arglist.max_episode_len)
            # collect experience
            for i, agent in enumerate(trainers):
                num_episodes = len(episode_rewards)
                # do this every iteration
                if arglist.critic_lstm and arglist.actor_lstm:
                    agent.experience(obs_n[i], action_n[i], rew_n[i],
                                    new_obs_n[i], done_n[i], # terminal,
                                    p_in_c_n[i][0], p_in_h_n[i][0],
                                    p_out_c_n[i][0], p_out_h_n[i][0],
                                    q_in_c_n[i][0], q_in_h_n[i][0],
                                    q_out_c_n[i][0], q_out_h_n[i][0], new_episode)
                elif arglist.critic_lstm:
                    agent.experience(obs_n[i], action_n[i], rew_n[i],
                                    new_obs_n[i], done_n[i], # terminal,
                                    q_in_c_n[i][0], q_in_h_n[i][0],
                                    q_out_c_n[i][0], q_out_h_n[i][0],new_episode)
                elif arglist.actor_lstm:
                    agent.experience(obs_n[i], action_n[i], rew_n[i],
                                    new_obs_n[i], done_n[i], # terminal,
                                    p_in_c_n[i][0], p_in_h_n[i][0],
                                    p_out_c_n[i][0], p_out_h_n[i][0],
                                    new_episode)
                else:
                    agent.experience(obs_n[i], action_n[i], rew_n[i],
                                    new_obs_n[i], done_n[i], # terminal,
                                    new_episode)

            obs_n = new_obs_n

            # Adding rewards
            if arglist.tracking:
                for i, a in enumerate(trainers):
                    if arglist.num_episodes - len(episode_rewards) <= 1000:
                        a.tracker.record_information("goal", np.array(env.world.landmarks[0].state.p_pos))
                        a.tracker.record_information("position",np.array(env.world.agents[i].state.p_pos))
                    a.tracker.record_information("ag_reward", rew_n[i])
                    a.tracker.record_information("team_dist_reward", info_n["team_dist"][i])
                    a.tracker.record_information("team_diff_reward", info_n["team_diff"][i])

            # Closing graph writer
            if arglist.graph:
                writer.close()
            for i, rew in enumerate(rew_n):
                episode_rewards[-1] += rew
                agent_rewards[i][-1] += rew

            if done or terminal:
                new_episode = True
                num_episodes = len(episode_rewards)
                obs_n = env.reset()
                # reset trainers
                if arglist.actor_lstm or arglist.critic_lstm:
                    for agent in trainers:
                        agent.reset_lstm()
                if arglist.tracking:
                    for agent in trainers:
                        agent.tracker.reset()
                episode_step = 0
                episode_rewards.append(0)
                for a in agent_rewards:
                    a.append(0)
                agent_info.append([[]])
            else:
                new_episode = False

            # increment global step counter
            train_step += 1

            # for benchmarking learned policies
            if arglist.benchmark:
                for i, info in enumerate(info_n):
                    agent_info[-1][i].append(info_n['n'])
                if train_step > arglist.benchmark_iters and (done or terminal):
                    file_name = arglist.benchmark_dir + arglist.exp_name + '.pkl'
                    print('Finished benchmarking, now saving...')
                    with open(file_name, 'wb') as fp:
                        pickle.dump(agent_info[:-1], fp)
                    break
                continue

            # update all trainers, if not in display or benchmark mode
            loss = None

            # get same episode sampling
            if arglist.sync_sampling:
                inds = [random.randint(0, len(trainers[0].replay_buffer._storage)-1) for i in range(arglist.batch_size)]
            else:
                inds = None
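            # With sync_sampling, every agent's preupdate() receives the same indices, presumably so
            # the agents' minibatches line up across their replay buffers; otherwise each samples independently.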

            for agent in trainers:
                # if arglist.lstm:
                #     agent.preupdate(inds=inds)
                # else:
                agent.preupdate(inds)
            for agent in trainers:
                loss = agent.update(trainers, train_step)
                if loss is None: continue

            # for displaying learned policies
            if arglist.display:
                env.render()
                # continue

            # save model, display training output
            if terminal and (len(episode_rewards) % arglist.save_rate == 0):
                U.save_state(arglist.save_dir, saver=saver)
                # print statement depends on whether or not there are adversaries
                if num_adversaries == 0:
                    print("steps: {}, episodes: {}, mean episode reward: {}, time: {}".format(
                        train_step, len(episode_rewards), np.mean(episode_rewards[-arglist.save_rate:]), round(time.time()-t_start, 3)))
                else:
                    print("steps: {}, episodes: {}, mean episode reward: {}, agent episode reward: {}, time: {}".format(
                        train_step, len(episode_rewards), np.mean(episode_rewards[-arglist.save_rate:]),
                        [np.mean(rew[-arglist.save_rate:]) for rew in agent_rewards], round(time.time()-t_start, 3)))
                t_start = time.time()
                # Keep track of final episode reward
                final_ep_rewards.append(np.mean(episode_rewards[-arglist.save_rate:]))
                for rew in agent_rewards:
                    final_ep_ag_rewards.append(np.mean(rew[-arglist.save_rate:]))

            # saves final episode reward for plotting training curve later
            if len(episode_rewards) > arglist.num_episodes:
                # U.save_state(arglist.save_dir, saver=saver)
                if arglist.tracking:
                    for agent in trainers:
                        agent.tracker.save()

                rew_file_name = "rewards/" + arglist.commit_num + "_rewards.pkl"
                with open(rew_file_name, 'wb') as fp:
                    pickle.dump(final_ep_rewards, fp)
                agrew_file_name = "rewards/" + arglist.commit_num + "_agrewards.pkl"
                # agrew_file_name = arglist.plots_dir + arglist.exp_name + '_agrewards.pkl'
                with open(agrew_file_name, 'wb') as fp:
                    pickle.dump(final_ep_ag_rewards, fp)
                print('...Finished total of {} episodes.'.format(len(episode_rewards)))
                break
Example #17
0
def train(arglist):
    with U.single_threaded_session():
        # [Initialization]
        # Create environment
        env = make_env(arglist.scenario, arglist, arglist.benchmark)
        # Create agent trainers
        obs_shape_n = [env.observation_space[i].shape for i in range(env.n)]
        num_adversaries = min(env.n, arglist.num_adversaries)
        trainers = get_trainers(env, num_adversaries, obs_shape_n, arglist)
        print('Using good policy {} and adv policy {}'.format(
            arglist.good_policy, arglist.adv_policy))

        # Initialize (TensorFlow initialization procedure)
        U.initialize()

        # Load previous results, if necessary
        if arglist.load_dir == "":
            arglist.load_dir = arglist.save_dir
        if arglist.display or arglist.restore or arglist.benchmark:
            print('Loading previous state...')
            U.load_state(arglist.load_dir)

        # Parameters initialization
        episode_rewards = [0.0]  # sum of rewards for all agents
        agent_rewards = [[0.0]
                         for _ in range(env.n)]  # individual agent reward
        final_ep_rewards = []  # sum of rewards for training curve
        final_ep_ag_rewards = []  # agent rewards for training curve
        agent_info = [[[]]]  # placeholder for benchmarking info
        saver = tf.train.Saver()
        obs_n = env.reset()
        episode_step = 0
        train_step = 0
        t_start = time.time()

        print('Starting iterations...')
        while True:
            # get each agent's action from its policy
            action_n = [
                agent.action(obs) for agent, obs in zip(trainers, obs_n)
            ]
            # environment step according to the actions: returns the next
            # observations, rewards, done flags and info from the simulation
            new_obs_n, rew_n, done_n, info_n = env.step(action_n)
            episode_step += 1
            done = all(done_n)  # check whether all agents are done
            terminal = (episode_step >= arglist.max_episode_len)  # check for timeout
            # record the transition in each agent's replay buffer
            for i, agent in enumerate(trainers):
                agent.experience(obs_n[i], action_n[i], rew_n[i], new_obs_n[i],
                                 done_n[i], terminal)
            obs_n = new_obs_n  # advance to the new observation

            # update the total reward and each agent's reward
            for i, rew in enumerate(rew_n):
                episode_rewards[-1] += rew
                agent_rewards[i][-1] += rew

            if done or terminal:  # episode finished or timed out; reset the environment
                obs_n = env.reset()
                episode_step = 0
                episode_rewards.append(0)
                for a in agent_rewards:
                    a.append(0)
                agent_info.append([[]])

            # increment global step counter
            train_step += 1

            # for benchmarking learned policies
            if arglist.benchmark:  # Save the agents' information.
                for i, info in enumerate(info_n):
                    agent_info[-1][i].append(info_n['n'])
                if train_step > arglist.benchmark_iters and (done or terminal):
                    file_name = arglist.benchmark_dir + arglist.exp_name + '.pkl'
                    print('Finished benchmarking, now saving...')
                    with open(file_name, 'wb') as fp:
                        pickle.dump(agent_info[:-1], fp)
                    break
                continue

            # for displaying learned policies
            if arglist.display:
                time.sleep(0.1)  # brief delay so rendering is watchable
                env.render()  # render the environment
                continue

            # update all trainers, if not in display or benchmark mode [Important]
            loss = None
            for agent in trainers:
                # clear the sample indices chosen by 'make_index' (agent.replay_sample_index = None)
                agent.preupdate()
            for agent in trainers:
                loss = agent.update(trainers, train_step)

            # save model, display training output
            if terminal and (len(episode_rewards) % arglist.save_rate == 0):
                U.save_state(arglist.save_dir, saver=saver)
                # print statement depends on whether or not there are adversaries
                if num_adversaries == 0:
                    print(
                        "steps: {}, episodes: {}, mean episode reward: {}, time: {}"
                        .format(train_step, len(episode_rewards),
                                np.mean(episode_rewards[-arglist.save_rate:]),
                                round(time.time() - t_start, 3)))
                else:
                    print(
                        "steps: {}, episodes: {}, mean episode reward: {}, agent episode reward: {}, time: {}"
                        .format(train_step, len(episode_rewards),
                                np.mean(episode_rewards[-arglist.save_rate:]),
                                [
                                    np.mean(rew[-arglist.save_rate:])
                                    for rew in agent_rewards
                                ], round(time.time() - t_start, 3)))
                t_start = time.time()
                # Keep track of final episode reward
                final_ep_rewards.append(
                    np.mean(episode_rewards[-arglist.save_rate:]))
                for rew in agent_rewards:
                    final_ep_ag_rewards.append(
                        np.mean(rew[-arglist.save_rate:]))

            # saves final episode reward for plotting training curve later
            if len(episode_rewards) > arglist.num_episodes:
                rew_file_name = arglist.plots_dir + arglist.exp_name + '_rewards.pkl'
                with open(rew_file_name, 'wb') as fp:
                    pickle.dump(final_ep_rewards, fp)
                agrew_file_name = arglist.plots_dir + arglist.exp_name + '_agrewards.pkl'
                with open(agrew_file_name, 'wb') as fp:
                    pickle.dump(final_ep_ag_rewards, fp)
                print('...Finished total of {} episodes.'.format(
                    len(episode_rewards)))
                break
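
Example #17's comments describe the per-step training cycle: store the transition with experience(), clear any cached sampling indices with preupdate(), then call update() once enough data has accumulated. A minimal sketch of a trainer exposing that interface, with the gradient steps stubbed out (the internals here are illustrative assumptions, not the real trainer used by these scripts):

import numpy as np

class MinimalTrainer(object):
    def __init__(self, batch_size=1024):
        self.replay_buffer = []          # stored transitions
        self.replay_sample_index = None  # indices chosen for the next update
        self.batch_size = batch_size

    def experience(self, obs, act, rew, new_obs, done, terminal):
        # Store one transition; `terminal` marks an episode timeout.
        self.replay_buffer.append((obs, act, rew, new_obs, float(done)))

    def preupdate(self):
        # Forget any previously chosen sample indices (some variants accept
        # shared indices here, as in the sync_sampling example above).
        self.replay_sample_index = None

    def update(self, trainers, train_step):
        # Skip learning until enough data has been collected.
        if len(self.replay_buffer) < self.batch_size:
            return None
        if self.replay_sample_index is None:
            self.replay_sample_index = np.random.randint(
                0, len(self.replay_buffer), size=self.batch_size)
        batch = [self.replay_buffer[i] for i in self.replay_sample_index]
        # A real MADDPG-style trainer would run critic and actor gradient steps
        # on `batch` here, using the other trainers' policies, and return losses.
        return [0.0]

trainer = MinimalTrainer(batch_size=4)
for step in range(8):
    trainer.experience(step, 0, 1.0, step + 1, False, False)
print(trainer.update([trainer], train_step=8))  # [0.0] once the buffer is large enough
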
Example #18
def train(arglist):

    ############################################

    @marlo.threaded
    def funcion(env, action, agent_num):

        contador = 0
        while True:  # Execute the action, guarding against errors
            _, r, done, info, new_obs = env.step(np.argmax(action) + 1)
            new_obs = new_obs['observation']
            if new_obs is None:
                new_obs = last_obs[agent_num]
            else:
                new_obs = [
                    new_obs.get('XPos'),
                    new_obs.get('ZPos'),
                    new_obs.get('Yaw')
                ]
            contador += 1
            if r != 0:
                break
            elif info is not None:
                if "caught_the_Chicken" in info:
                    r += 1
                    print("SE HA HARCODEADO LA PUNTUACION ", done, " ", info)
                    break

                if "Agent0_defaulted" in info:
                    r = -0.02
                    break

                if "Agent1_defaulted" in info:
                    r = -0.02
                    break

            elif contador >= 100:
                print("SE HA TARDADO MUCHO EN REALIZAR LA ACCION")
                break
        return new_obs, r, done, info


#######################################################

    with U.single_threaded_session():

        # Create environment

        client_pool = [('127.0.0.1', 10000), ('127.0.0.1', 10001)]
        join_tokens = marlo.make(
            "MarLo-MobchaseTrain1-v0",
            params=dict(client_pool=client_pool,
                        agent_names=["MarLo-Agent-0", "MarLo-Agent-1"],
                        videoResolution=[64, 64],
                        kill_clients_after_num_rounds=500,
                        forceWorldReset=False,
                        max_retries=500,
                        retry_sleep=0.1,
                        step_sleep=0.1,
                        prioritise_offscreen_rendering=False,
                        suppress_info=False))
        assert len(join_tokens) == 2

        # Create agent trainers
        #obs_shape_n = [(64,64,3,),(64,64,3,)]
        observation_space = [
            gym.spaces.Box(low=-np.inf,
                           high=+np.inf,
                           shape=(6, ),
                           dtype=np.float32),
            gym.spaces.Box(low=-np.inf,
                           high=+np.inf,
                           shape=(6, ),
                           dtype=np.float32)
        ]
        obs_shape_n = [observation_space[i].shape for i in range(2)]
        action_space = [gym.spaces.Discrete(4), gym.spaces.Discrete(4)]
        num_adversaries = 0
        trainers = get_trainers(num_adversaries, obs_shape_n, action_space,
                                arglist)

        # Initialize
        U.initialize()

        epis_trans = 0
        epsilon = 0.0

        # Load previous results, if necessary
        if arglist.load_dir == "":
            arglist.load_dir = arglist.save_dir
        if arglist.restore:
            print('Loading previous state...')
            resbuf = pickle.load(open("./saves/losbuffers.p", "rb"))
            epis_trans = resbuf[2]
            epsilon = resbuf[3]
            U.load_state(arglist.load_dir + str(epis_trans))
            trainers[0].replay_buffer = resbuf[0]
            trainers[1].replay_buffer = resbuf[1]

        episode_rewards = []
        agent_rewards = [
            [] for _ in range(2)
        ]  # list of each agent's per-episode reward sums
        final_ep_rewards = []  # sum of rewards for training curve
        final_ep_ag_rewards = []  # agent rewards for training curve
        saver = tf.train.Saver()
        t_start = time.time()

        #inicial0 = [1.5, 2.5, 270, 5.5, 6.5, 180]
        #inicial1 = [5.5, 6.5, 180, 1.5, 2.5, 270]
        inicial0 = [1.5, 2.5, 270, 3.5, 4.5, 180]
        inicial1 = [3.5, 4.5, 180, 1.5, 2.5, 270]

        while True:
            #NEW

            last_obs = []

            agent_rewards[0].append(0)
            agent_rewards[1].append(0)

            env0 = marlo.init(join_tokens[0])
            env1 = marlo.init(join_tokens[1])

            # Run agent-0
            agent_thread_0, res0 = reiniciar(env0)
            # Run agent-1
            agent_thread_1, res1 = reiniciar(env1)

            obs0 = res0.get()
            obs1 = res1.get()

            obs0 = inicial0
            obs1 = inicial1

            done0 = False
            done1 = False

            num_eps = 0

            # Run 10 episodes
            while True:

                if (random() > epsilon):
                    # get the action chosen by agent 0's policy
                    action0 = trainers[0].action(np.array(obs0))
                else:
                    action0 = np.random.dirichlet(np.ones(4), size=1)[0]

                if (random() > epsilon):
                    # get the action chosen by agent 1's policy
                    action1 = trainers[1].action(np.array(obs1))
                else:
                    action1 = np.random.dirichlet(np.ones(4), size=1)[0]
                #print("Estan dentro")
                # Run agent-0
                agent_thread_0, resul0 = funcion(env0, action0, 0)
                # Run agent-1
                agent_thread_1, resul1 = funcion(env1, action1, 1)

                # Wait for both the threads to complete execution
                agent_thread_0.join()
                #print("Esta fuera 1")
                agent_thread_1.join()
                #print("Estan fuera")

                nob0, r0, done0, i0 = resul0.get()
                nob1, r1, done1, i1 = resul1.get()

                last_obs = [copy.deepcopy(nob0), copy.deepcopy(nob1)]

                # The new joint observations: each agent's own state followed by the other's
                varhelp = copy.deepcopy(nob0)
                nob0.extend(nob1)
                nob1.extend(varhelp)

                #print("ESTAS SON LAS OBSERVACIONES")
                #print(nob0)
                #print(nob1)

                trainers[0].experience(np.array(obs0), action0, r0,
                                       np.array(nob0), done0, False)
                trainers[1].experience(np.array(obs1), action1, r1,
                                       np.array(nob1), done1, False)

                agent_rewards[0][-1] += r0
                agent_rewards[1][-1] += r1

                obs0 = nob0
                obs1 = nob1

                if done0 or done1:
                    print("EPISODIO NUMERO:", num_eps)
                    # Run agent-0
                    agent_thread_0, res0 = reiniciar(env0)
                    # Run agent-1
                    agent_thread_1, res1 = reiniciar(env1)

                    obs0 = res0.get()
                    obs1 = res1.get()
                    obs0 = inicial0
                    obs1 = inicial1
                    done0 = False
                    done1 = False
                    num_eps += 1

                    loss = None
                    for agent in trainers:
                        agent.preupdate()
                    for agent in trainers:
                        loss = agent.update(trainers)
                        print("LA LOSS", loss)

                    if num_eps % epi_per_iter == 0:
                        break
                    agent_rewards[0].append(0)
                    agent_rewards[1].append(0)

            # End of running the 10 episodes
            print("END OF THE SAMPLE")

            # Pair up the two agents' rewards for the episodes generated in this
            # iteration, turn each pair into a list and apply sum().
            # The results are appended to the end of episode_rewards.
            #
            # In short: the agents' most recent per-episode rewards are summed and
            # added to the list (see the sketch after this example).
            episode_rewards.extend(
                list(
                    map(
                        sumtuple,
                        list(
                            zip(agent_rewards[0][epis_trans:],
                                agent_rewards[1][epis_trans:])))))

            epis_trans += 10
            if epsilon > 0.1:
                epsilon -= 0.002

            print("TOTAL DE EPISODIOS TRANSCURRIDOS: ", epis_trans,
                  " Epsilon: ", epsilon)

            # update all trainers, if not in display or benchmark mode

            # save model, display training output
            if (epis_trans % arglist.save_rate == 0):
                U.save_state(arglist.save_dir + str(epis_trans), saver=saver)
                losbuffers = [
                    trainers[0].replay_buffer, trainers[1].replay_buffer,
                    epis_trans, epsilon
                ]
                pickle.dump(
                    losbuffers,
                    open("./saves/losbuffers" + str(epis_trans) + ".p", "wb"))
                pickle.dump(losbuffers, open("./saves/losbuffers.p", "wb"))
            if (epis_trans % 1000 == 0):
                break
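
Example #18 folds the two agents' per-episode rewards into episode_rewards through an undefined sumtuple helper. A plausible reconstruction of that pattern with toy numbers (the body of sumtuple is an assumption; only its use with map and zip appears above):

def sumtuple(pair):
    # Sum one (agent0_reward, agent1_reward) pair for a single episode.
    return sum(pair)

agent_rewards = [[1.0, 2.0, 3.0], [0.5, 0.5, 0.5]]  # toy per-episode rewards
epis_trans = 1                                       # episodes already accounted for

episode_rewards = []
episode_rewards.extend(
    map(sumtuple, zip(agent_rewards[0][epis_trans:],
                      agent_rewards[1][epis_trans:])))
print(episode_rewards)  # [2.5, 3.5]
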
Example #19
0
def train(arglist):
    """
    Run MADDPG algorithm using passed in commandline arguments

    Args:
        arglist (argparse.Namespace): Parsed commandline arguments object
    """
    tf.reset_default_graph()

    if arglist.seed is not None:
        np.random.seed(arglist.seed)
        tf.set_random_seed(arglist.seed)

    with tf_util.make_session(config=None,
                              num_cpu=1,
                              make_default=False,
                              graph=None):
        # with tf_util.single_threaded_session():
        ###########################################
        #         Create environment              #
        ###########################################
        env = make_env(arglist.scenario,
                       arglist=arglist,
                       done=arglist.done_callback,
                       logging=arglist.logging,
                       benchmark=arglist.benchmark)

        ###########################################
        #        Create agent trainers            #
        ###########################################
        obs_shape_n = [env.observation_space[i].shape for i in range(env.n)]
        num_adversaries = min(env.n, arglist.num_adversaries)
        trainers = get_trainers(env, num_adversaries, obs_shape_n, arglist)

        print("Number of Adversaries: {}".format(num_adversaries))
        print('Experiment: {}. Using good policy {} and adv policy {}'.format(
            arglist.exp_name, arglist.good_policy, arglist.adv_policy))

        ###########################################
        #              Initialize                 #
        ###########################################
        tf_util.initialize()

        ###########################################
        #   Load previous results, if necessary   #
        ###########################################
        if arglist.load_dir == "":
            arglist.load_dir = arglist.save_dir

        # if arglist.display or arglist.restore or arglist.benchmark or arglist.load_dir is not None:
        if arglist.restore or arglist.benchmark or arglist.load_dir is not None:
            print('Loading previous state...')

            # Set model file
            if arglist.model_file == "":
                arglist.model_file = arglist.exp_name

            print("Model File: " + arglist.load_dir + arglist.model_file)
            tf_util.load_state(arglist.load_dir + arglist.model_file)

        ###########################################
        #       Create the save directory         #
        ###########################################
        if not os.path.exists(arglist.save_dir):
            os.makedirs(arglist.save_dir, exist_ok=True)

        if not os.path.exists(arglist.plots_dir):
            os.makedirs(arglist.plots_dir, exist_ok=True)

        ###########################################
        #             Set parameters              #
        ###########################################
        # Sum of rewards for all agents
        episode_rewards = [0.0]

        # This was changed so that a reward can be tracked for fixed policy agents as well as learning agents
        # Individual agent reward
        # agent_rewards = [[0.0] for _ in range(env.n)]
        agent_rewards = [[0.0] for _ in range(len(env.world.agents))]

        # Retrieve previous episode count
        try:
            prev_ep_ct = int(arglist.model_file.split("_")[-1])
        except ValueError:
            print("Starting from untrained network...")
            prev_ep_ct = 0
        ep_ct = prev_ep_ct + arglist.num_episodes

        # Sum of rewards for training curve
        final_ep_rewards = []

        # Agent rewards for training curve
        final_ep_ag_rewards = []

        # Placeholder for benchmarking info
        agent_info = [[[]]]

        saver = tf.train.Saver()
        obs_n = env.reset()
        episode_step = 0
        train_step = 0
        t_start = time.time()
        progress = False

        # Save more often if you have fewer episodes
        arglist.save_rate = min(arglist.save_rate, arglist.num_episodes)

        # Initialize loss file for each agent
        if arglist.log_loss:
            for i in range(len(env.world.agents)):
                log_loss(arglist, ep_ct, "agent_{}".format(i), initialize=True)

        ###########################################
        #                 Start                   #
        ###########################################
        print('Starting iterations...')
        while True:
            # TODO: Switch to isinstance()
            # if type(env.world.scripted_agents[0].action) == type(None):
            #     print("Error")

            # Get action
            action_n = [
                agent.action(obs) for agent, obs in zip(trainers, obs_n)
            ]

            # Environment step
            new_obs_n, rew_n, done_n, info_n = env.step(action_n)

            # Logging step
            if arglist.logging:
                env.log(
                    len(episode_rewards) + prev_ep_ct, episode_step, new_obs_n,
                    rew_n, done_n, info_n)

            # Update information
            episode_step += 1

            # Check if all agents are done
            # done = all(done_n)

            # Check if any agents are done
            done = any(done_n)

            terminal = (episode_step >= arglist.max_episode_len)

            # Collect experience
            for i, agent in enumerate(trainers):
                agent.experience(obs_n[i], action_n[i], rew_n[i], new_obs_n[i],
                                 done_n[i], terminal)
            obs_n = new_obs_n

            for i, rew in enumerate(rew_n):
                episode_rewards[-1] += rew
                agent_rewards[i][-1] += rew

            # For displaying learned policies
            if arglist.display:
                time.sleep(0.1)
                env.render()
                if done or terminal:
                    print('Episode Reward: {}'.format(
                        [rew[-1] for rew in agent_rewards]))
                    time.sleep(0.5)
                    obs_n = env.reset()
                    episode_step = 0
                    episode_rewards.append(0)
                    for a in agent_rewards:
                        a.append(0)
                    agent_info.append([[]])
                continue

            if done or terminal:
                obs_n = env.reset()
                episode_step = 0
                episode_rewards.append(0)
                for a in agent_rewards:
                    a.append(0)
                agent_info.append([[]])

            # Increment global step counter
            train_step += 1

            # For benchmarking learned policies
            if arglist.benchmark:
                for i, info in enumerate(info_n):
                    agent_info[-1][i].append(info_n['n'])

                if train_step > arglist.benchmark_iters and (done or terminal):
                    file_name = arglist.benchmark_dir + arglist.exp_name + '.pkl'
                    print('Finished benchmarking, now saving...')
                    with open(file_name, 'wb') as fp:
                        pickle.dump(agent_info[:-1], fp)
                    break
                continue

            # In testing mode, don't perform model updates
            if arglist.testing:
                if len(episode_rewards) > arglist.num_episodes:
                    print("episodes: {}, "
                          "mean episode reward: {}, time: {}".format(
                              len(episode_rewards),
                              np.mean(episode_rewards[-arglist.save_rate:]),
                              round(time.time() - t_start, 3)))
                    env.logger.save("State",
                                    arglist.save_dir,
                                    filename=arglist.exp_name + '_state' +
                                    '_' + str(prev_ep_ct) + arglist.log_append)
                    break
                continue

            # Update all trainers, if not in display or benchmark mode
            loss = None
            for agent in trainers:
                agent.preupdate()
            for i, agent in enumerate(trainers):
                loss = agent.update(trainers, train_step)
                if arglist.log_loss and loss is not None:
                    log_loss(arglist,
                             ep_ct,
                             "agent_{}".format(i),
                             loss=loss[1])

            if len(episode_rewards) % 100 == 0 and progress:
                print("Episode {} Reached. Time: {}".format(
                    len(episode_rewards),
                    time.time() - t_start))
                progress = False
            elif len(episode_rewards) % 100 != 0 and not progress:
                progress = True

            # Save model, display training output
            if (terminal or done) and (len(episode_rewards) % arglist.save_rate
                                       == 0):
                # TODO: Implement some checks so that we don't overwrite old networks unintentionally?

                # Save model state
                tf_util.save_state(arglist.save_dir + arglist.exp_name + '_' +
                                   str(len(episode_rewards) + prev_ep_ct),
                                   saver=saver)

                # Print statement depends on whether or not there are adversaries
                if num_adversaries == 0:
                    print(
                        "steps: {}, episodes: {}, mean episode reward: {}, time: {}"
                        .format(train_step,
                                len(episode_rewards) + prev_ep_ct,
                                np.mean(episode_rewards[-arglist.save_rate:]),
                                round(time.time() - t_start, 3)))
                else:
                    print(
                        "steps: {}, episodes: {}, mean episode reward: {}, agent episode reward: {}, time: {}"
                        .format(train_step,
                                len(episode_rewards) + prev_ep_ct,
                                np.mean(episode_rewards[-arglist.save_rate:]),
                                [
                                    np.mean(reward[-arglist.save_rate:])
                                    for reward in agent_rewards
                                ], round(time.time() - t_start, 3)))

                # Reset start time to current time
                t_start = time.time()

                # Keep track of final episode reward
                final_ep_rewards.append(
                    np.mean(episode_rewards[-arglist.save_rate:]))
                for reward in agent_rewards:
                    final_ep_ag_rewards.append(
                        np.mean(reward[-arglist.save_rate:]))

            # Saves final episode reward for plotting training curve later
            if len(episode_rewards) > arglist.num_episodes:
                rew_file_name = arglist.plots_dir + arglist.exp_name + '_rewards.pkl'
                with open(rew_file_name, 'wb') as fp:
                    pickle.dump(final_ep_rewards, fp)

                agrew_file_name = arglist.plots_dir + arglist.exp_name + '_agrewards.pkl'
                with open(agrew_file_name, 'wb') as fp:
                    pickle.dump(final_ep_ag_rewards, fp)

                # Log agent data for run
                env.logger.save("State",
                                arglist.save_dir,
                                filename=arglist.exp_name + '_state' + '_' +
                                str(len(episode_rewards) + prev_ep_ct))

                print('...Finished total of {} episodes.'.format(
                    len(episode_rewards)))
                break
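
Example #19 resumes its episode count by reading the trailing integer off the checkpoint file name and appending the updated total when saving. A small sketch of that naming convention, under the assumption that checkpoints are named '<exp_name>_<episode_count>':

def parse_prev_episode_count(model_file):
    # "exp1_2500" -> 2500; a name without a numeric suffix starts from 0.
    try:
        return int(model_file.split("_")[-1])
    except ValueError:
        return 0

def next_checkpoint_name(exp_name, episodes_this_run, prev_ep_ct):
    return "{}_{}".format(exp_name, episodes_this_run + prev_ep_ct)

print(parse_prev_episode_count("exp1_2500"))     # 2500
print(parse_prev_episode_count("untrained"))     # 0
print(next_checkpoint_name("exp1", 1000, 2500))  # exp1_3500
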
Example #20
def train(arglist):
    with U.single_threaded_session():
        # Create environment
        env = StarCraft2Env(map_name=arglist.scenario,
                            reward_only_positive=False,
                            obs_last_action=True,
                            obs_timestep_number=True,
                            reward_scale_rate=200)
        # Create agent trainers
        env_info = env.get_env_info()
        num_agents = env_info["n_agents"]
        num_adversaries = num_agents
        obs_shape_n = [(env_info["obs_shape"], )
                       for i in range(num_adversaries)]
        action_space_n = [
            env_info["n_actions"] for i in range(num_adversaries)
        ]
        buffer_size = arglist.buffer_size

        trainers = get_trainers(num_adversaries, obs_shape_n, action_space_n,
                                arglist, buffer_size)
        print('Using good policy {} and adv policy {}'.format(
            arglist.good_policy, arglist.adv_policy))

        # Initialize
        U.initialize()

        logdir = "./tensorboard/"

        Logger.DEFAULT \
            = Logger.CURRENT \
            = Logger(dir=None,
                     output_formats=[TensorBoardOutputFormat(logdir)])

        # Load previous results, if necessary
        if arglist.load_dir == "":
            arglist.load_dir = arglist.save_dir
        if arglist.display or arglist.restore or arglist.benchmark:
            print('Loading previous state...')
            U.load_state(arglist.load_dir)

        episode_rewards = [0.0]  # sum of rewards for all agents
        agent_rewards = [[0.0]
                         for _ in range(num_agents)]  # individual agent reward
        saver = tf.train.Saver(max_to_keep=100000000)
        n_actions_no_attack = 6

        env.reset()

        obs_n = []
        reward_hl_own_old = []
        reward_hl_en_old = []
        for agent_id in range(num_agents):  # first loop: collect the initial observations and health values
            obs = env.get_obs_agent(agent_id)
            obs_n.append(obs)
            reward_hl_own_old.append(env.get_agent_health(agent_id))
            reward_hl_en_old.append(env.get_enemy_health(agent_id))

        episode_step = 0
        step = 0

        print('Starting iterations...')
        while True:
            # get action
            action_set_actual = []
            action_set_execute = []
            action_n = []
            dead_unit = []
            for agent_id in range(num_agents):
                action_output = trainers[agent_id].action(obs_n[agent_id])
                action_n.append(action_output)
                action_prob = action_output
                action_to_choose = np.argmax(action_prob)
                action_set_actual.append(action_to_choose)
                avail_actions = env.get_avail_agent_actions(agent_id)
                avail_actions_ind = np.nonzero(avail_actions)[0]
                if action_to_choose in avail_actions_ind:
                    action_set_execute.append(action_to_choose)
                elif (avail_actions[0] == 1):
                    # the chosen action is unavailable and the agent is dead: use NO_OP instead
                    action_set_execute.append(0)
                else:
                    # the chosen action is unavailable: use STOP instead
                    action_set_execute.append(1)

                if (len(avail_actions_ind) == 1
                        and avail_actions_ind[0] == 0):  # check whether this agent is already dead
                    dead_unit.append(agent_id)

            rew_base, done, _ = env.step(action_set_execute)
            episode_rewards[-1] += rew_base
            new_obs_n = []
            reward_hl_own_new = []
            reward_hl_en_new = []
            rew_n = []

            for agent_id in range(num_agents):
                obs_next = env.get_obs_agent(agent_id=agent_id)
                new_obs_n.append(obs_next)
                reward_hl_own_new.append(env.get_agent_health(agent_id))
                reward_hl_en_new.append(env.get_enemy_health(agent_id))

            for agent_id in range(num_agents):
                if (agent_id in dead_unit):
                    reward = 0
                elif (action_set_execute[agent_id] !=
                      action_set_actual[agent_id]):
                    # the chosen action could not be executed, so a substitute ran,
                    # but the chosen action is stored and given a negative reward
                    reward = -2

                elif (action_set_execute[agent_id] > 5):
                    target_id = action_set_execute[
                        agent_id] - n_actions_no_attack
                    health_reduce_en = reward_hl_en_old[
                        target_id] - reward_hl_en_new[target_id]
                    if (health_reduce_en > 0):
                        if (rew_base > 0):
                            reward = 2 + rew_base
                        else:
                            reward = 2
                    else:
                        reward = 1
                else:
                    reward = (reward_hl_own_new[agent_id] -
                              reward_hl_own_old[agent_id]) * 5
                rew_n.append(reward)

            episode_step += 1

            # collect experience
            for i, agent in enumerate(trainers):
                agent.experience(obs_n[i], action_n[i], rew_n[i], new_obs_n[i],
                                 done)

            obs_n = new_obs_n
            reward_hl_own_old = reward_hl_own_new
            reward_hl_en_old = reward_hl_en_new

            for i, rew in enumerate(rew_n):
                agent_rewards[i][-1] += rew

            if done:
                print("steps until now : %s, episode: %s, episode reward: %s" %
                      (step, len(episode_rewards), episode_rewards[-1]))
                logger.record_tabular("episodes", len(episode_rewards))
                logger.record_tabular("episode reward", episode_rewards[-1])
                for i in range(num_agents):
                    logger.record_tabular("agent" + str(i) + " episode reward",
                                          agent_rewards[i][-1])
                logger.dump_tabular()

                env.reset()
                obs_n = []
                reward_hl_own_old = []
                reward_hl_en_old = []
                for agent_id in range(num_agents):  # collect the initial observations and health values after the reset
                    obs = env.get_obs_agent(agent_id)
                    obs_n.append(obs)
                    reward_hl_own_old.append(env.get_agent_health(agent_id))
                    reward_hl_en_old.append(env.get_enemy_health(agent_id))
                episode_step = 0
                episode_rewards.append(0)
                for a in agent_rewards:
                    a.append(0)

            # increment global step counter
            step += 1
            if (step == arglist.buffer_size):
                print("Training starts.")

            # update all trainers, if not in display or benchmark mode
            loss = None
            for agent in trainers:
                agent.preupdate()
            for agent in trainers:
                loss = agent.update(trainers, step)

            # save model, display training output
            if done and (len(episode_rewards) % arglist.save_rate == 0):
                save_dir = arglist.save_dir + "/model_" + str(
                    step) + "steps/" + arglist.exp_name
                U.save_state(save_dir, saver=saver)
                # print statement depends on whether or not there are adversaries
                if num_adversaries == 0:
                    print("steps: {}, episodes: {}, mean episode reward: {}".
                          format(step, len(episode_rewards),
                                 np.mean(
                                     episode_rewards[-arglist.save_rate:])))
                else:
                    print(
                        "steps: {}, episodes: {}, mean episode reward: {}, agent episode reward: {}"
                        .format(step, len(episode_rewards),
                                np.mean(episode_rewards[-arglist.save_rate:]),
                                [
                                    np.mean(rew[-arglist.save_rate:])
                                    for rew in agent_rewards
                                ]))

            # saves final episode reward for plotting training curve later
            if len(episode_rewards) > arglist.num_episodes:
                print('...Finished total of {} episodes.'.format(
                    len(episode_rewards) - 1))
                break
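
Example #20 shapes each StarCraft agent's reward from the availability of its chosen action, the enemy health reduction for attacks, and the agent's own health change otherwise. The same logic condensed into one standalone function for readability (a restatement of the code above with illustrative names, not part of the original script):

N_ACTIONS_NO_ATTACK = 6  # actions 0..5 are no-op/stop/move; index i >= 6 attacks enemy (i - 6)

def shaped_reward(agent_id, executed, intended, dead_units, rew_base,
                  own_health_old, own_health_new,
                  enemy_health_old, enemy_health_new):
    if agent_id in dead_units:
        return 0
    if executed != intended:
        # The policy's choice was unavailable and had to be replaced.
        return -2
    if executed >= N_ACTIONS_NO_ATTACK:
        target = executed - N_ACTIONS_NO_ATTACK
        if enemy_health_old[target] - enemy_health_new[target] > 0:
            return 2 + rew_base if rew_base > 0 else 2
        return 1
    # Non-attack action: reward proportional to the agent's own health change.
    return (own_health_new[agent_id] - own_health_old[agent_id]) * 5
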
Example #21
def train(arglist):
    with U.make_session(8):
        # Create environment
        env = make_env(arglist.scenario, arglist, arglist.benchmark)
        # Create agent trainers
        obs_shape_n = [[29] for i in range(env.n)]
        obs_map_shape_n =[[56*86] for i in range(env.n)]
        num_adversaries = min(env.n, arglist.num_adversaries)
        trainers = get_trainers(env, num_adversaries, obs_shape_n, obs_map_shape_n,arglist)
        print('Using good policy {} and adv policy {}'.format(arglist.good_policy, arglist.adv_policy))

        # Initialize
        U.initialize()

        # Load previous results, if necessary
        if arglist.load_dir == "":
            arglist.load_dir = arglist.save_dir
        if arglist.display or arglist.restore or arglist.benchmark:
            print('Loading previous state...')
            U.load_state(arglist.load_dir)

        episode_rewards = [0.0]  # sum of rewards for all agents
        agent_rewards = [[0.0] for _ in range(env.n)]  # individual agent reward
        final_ep_rewards = []  # sum of rewards for training curve
        final_ep_ag_rewards = []  # agent rewards for training curve
        agent_info = [[[]]]  # placeholder for benchmarking info
        saver = tf.train.Saver()
        obs_n = env._reset()
        episode_step = 13000
        train_step = 0
        t_start = time.time()

        print('Starting iterations...')
        while True:
            # get action
            #action_n = [agent.action(obs) for agent, obs in zip(trainers,obs_n)]
            action_n=[]
            for agent, obs in zip(trainers,obs_n):
                #print(obs)
                t=agent.action(obs)
                d=np.argmax(t)
                # exploration: with small probability, swap the top action's
                # probability with that of another action in the same block of five
                if d%5==4:
                    rt=random.randint(0,20)
                    if rt<4:
                        swap=t[d]
                        t[d]=t[d-rt-1]
                        t[d-rt-1]=swap
                else:
                    rt=random.randint(0,80)
                    if rt<4:
                        swap=t[d]
                        t[d]=t[d//5*5+rt]
                        t[d//5*5+rt]=swap

                action_n.append(t)

            #print(action_n)
            # environment step
            new_obs_n, rew_n, done_n, info_n = env._step(action_n)
            
            #print(rew_n)
            

            episode_step += 1
            env.training_episode=episode_step

            done = all(done_n)
            terminal = (episode_step >= arglist.max_episode_len)
            # collect experience
            for i, agent in enumerate(trainers):
                agent.experience(obs_n[i], action_n[i], rew_n[i], new_obs_n[i], done_n[i], terminal)
            obs_n = new_obs_n

            for i, rew in enumerate(rew_n):
                episode_rewards[-1] += rew
                agent_rewards[i][-1] += rew

            if done or terminal:
                obs_n = env._reset()
                episode_step = 0
                episode_rewards.append(0)
                for a in agent_rewards:
                    a.append(0)
                agent_info.append([[]])

            # increment global step counter
            train_step += 1

            # for benchmarking learned policies
            if arglist.benchmark:
                for i, info in enumerate(info_n):
                    agent_info[-1][i].append(info_n['n'])
                if train_step > arglist.benchmark_iters and (done or terminal):
                    file_name = arglist.benchmark_dir + arglist.exp_name + '.pkl'
                    print('Finished benchmarking, now saving...')
                    with open(file_name, 'wb') as fp:
                        pickle.dump(agent_info[:-1], fp)
                    break
                continue

            # for displaying learned policies
            if arglist.online_display or arglist.display:
                time.sleep(0.01)
                #if rew_n[2]>0: pdb.set_trace()
                env._render(close=False)
                print(rew_n)
                # if (rew_n[2]>0) or (rew_n[0]>0) or (rew_n[1]>0):
                #     pdb.set_trace()
                #pdb.set_trace()

                if arglist.display: continue

            # update all trainers, if not in display or benchmark mode
            loss = None
            for agent in trainers:
                agent.preupdate()
            for agent in trainers:
                loss = agent.update(trainers, train_step)

            # save model, display training output
            if terminal and (len(episode_rewards) % arglist.save_rate == 0):
                U.save_state(arglist.save_dir, saver=saver)
                # print statement depends on whether or not there are adversaries
                if num_adversaries == 0:
                    print("steps: {}, episodes: {}, mean episode reward: {}, time: {}".format(
                        train_step, len(episode_rewards), np.mean(episode_rewards[-arglist.save_rate:]), round(time.time()-t_start, 3)))
                else:
                    print("steps: {}, episodes: {}, mean episode reward: {}, agent episode reward: {}, time: {}".format(
                        train_step, len(episode_rewards), np.mean(episode_rewards[-arglist.save_rate:]),
                        [np.mean(rew[-arglist.save_rate:]) for rew in agent_rewards], round(time.time()-t_start, 3)))
                t_start = time.time()

                

                # Keep track of final episode reward
                final_ep_rewards.append(np.mean(episode_rewards[-arglist.save_rate:]))
                for rew in agent_rewards:
                    final_ep_ag_rewards.append(np.mean(rew[-arglist.save_rate:]))

            # saves final episode reward for plotting training curve later
            if len(episode_rewards) > arglist.num_episodes:
                rew_file_name = arglist.plots_dir + arglist.exp_name + '_rewards.pkl'
                with open(rew_file_name, 'wb') as fp:
                    pickle.dump(final_ep_rewards, fp)
                agrew_file_name = arglist.plots_dir + arglist.exp_name + '_agrewards.pkl'
                with open(agrew_file_name, 'wb') as fp:
                    pickle.dump(final_ep_ag_rewards, fp)
                print('...Finished total of {} episodes.'.format(len(episode_rewards)))
                break
Example #22
def train(arglist):
    with U.single_threaded_session():
        # Create environment
        env = make_env(arglist.scenario, arglist, arglist.benchmark)

        # Create agent trainers
        obs_shape_n = [env.observation_space[i].shape for i in range(env.n)]
        num_adversaries = min(env.n, arglist.num_adversaries)
        trainers = get_trainers(env, num_adversaries, obs_shape_n, arglist)
        print('Using good policy {} and adv policy {}'.format(
            arglist.good_policy, arglist.adv_policy))
        print("number of adversaries is: ", num_adversaries)
        # Initialize
        U.initialize()

        # Load previous results, if necessary
        if arglist.load_dir == "":
            arglist.load_dir = arglist.save_dir
        if arglist.display or arglist.restore or arglist.benchmark:
            print('Loading previous state...')
            print("path is: ", arglist.load_dir)
            print("restoring checkpoints")
            # added for selective training.
            # Make it general for other environments as well later.
            if arglist.scenario == "simple_tag":
                print("inside simple tag")
                if not arglist.train_adversaries:
                    print("loading only positive")
                    print("number of adversaries are: ", num_adversaries)
                    saver = tf.train.Saver(var_list=tf.get_collection(
                        tf.GraphKeys.GLOBAL_VARIABLES,
                        scope="agent_" + str(num_adversaries)))
                    print(
                        "var list is: ",
                        tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES,
                                          scope="agent_" +
                                          str(num_adversaries)))
                if not arglist.train_positive_agent:
                    print("only loading adversaries")
                    var_list = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES,
                                                 scope="agent_0")
                    print("var list is: ", var_list)
                    for l in range(1, arglist.num_adversaries):
                        var_list += tf.get_collection(
                            tf.GraphKeys.GLOBAL_VARIABLES,
                            scope="agent_" + str(l))

                    saver = tf.train.Saver(var_list=var_list)

                U.load_state(arglist.load_dir, saver=saver)

            else:
                U.load_state(arglist.load_dir)

        episode_rewards = [0.0]  # sum of rewards for all agents
        agent_rewards = [[0.0]
                         for _ in range(env.n)]  # individual agent reward

        if arglist.restore:
            final_ep_rewards = list(
                np.load(arglist.plots_dir + arglist.exp_name +
                        '_episode_rewards.npy'))
            final_ep_ag_rewards = list(
                np.load(arglist.plots_dir + arglist.exp_name +
                        '_agent_rewards.npy'))
            final_ep_ag_rewards = [list(a) for a in final_ep_ag_rewards]
        else:
            final_ep_rewards = []  # sum of rewards for training curve
            # final_ep_ag_rewards = []  # agent rewards for training curve
            final_ep_ag_rewards = [[0.0] for _ in range(env.n)
                                   ]  # agent rewards for training curve

        agent_info = [[[]]]  # placeholder for benchmarking info
        saver = tf.train.Saver()
        obs_n = env.reset()
        episode_step = 0
        train_step = 0
        t_start = time.time()

        print("number of agents in the environment are: ", env.n)
        episode_avg_rewards = [0.0]
        agent_avg_rewards = [[0.0] for _ in range(env.n)]
        print('Starting iterations...')
        while True:
            # get action
            action_n = [
                agent.action(obs) for agent, obs in zip(trainers, obs_n)
            ]
            # environment step
            new_obs_n, rew_n, done_n, info_n = env.step(action_n)
            episode_step += 1
            done = all(done_n)
            terminal = (episode_step >= arglist.max_episode_len)
            # collect experience
            for i, agent in enumerate(trainers):
                agent.experience(obs_n[i], action_n[i], rew_n[i], new_obs_n[i],
                                 done_n[i], terminal)
            obs_n = new_obs_n

            for i, rew in enumerate(rew_n):
                episode_rewards[-1] += rew
                agent_rewards[i][-1] += rew

            if done or terminal:
                obs_n = env.reset()
                episode_step = 0

                # this should perhaps be done later.
                episode_rewards.append(0)
                for a in agent_rewards:
                    a.append(0)
                agent_info.append([[]])

            # increment global step counter
            train_step += 1

            # for benchmarking learned policies
            if arglist.benchmark:
                for i, info in enumerate(info_n):
                    agent_info[-1][i].append(info_n['n'])
                if train_step > arglist.benchmark_iters and (done or terminal):
                    file_name = arglist.benchmark_dir + arglist.exp_name + '.pkl'
                    print('Finished benchmarking, now saving...')
                    with open(file_name, 'wb') as fp:
                        pickle.dump(agent_info[:-1], fp)
                    break
                continue

            # for displaying learned policies
            if arglist.display:
                time.sleep(0.1)
                env.render()
                continue

            # update all trainers, if not in display or benchmark mode
            loss = None
            # for agent in trainers:
            #     agent.preupdate()
            # for agent in trainers:
            #     loss = agent.update(trainers, train_step)

            # adversaries occupy indices 0..num_adversaries-1; good agents follow
            for m in range(0, len(trainers)):
                agent = trainers[m]

                if not arglist.train_adversaries and m >= num_adversaries:
                    # updating only the good (non-adversary) agents
                    agent.preupdate()

                if not arglist.train_positive_agent and m < num_adversaries:
                    # updating only the adversaries
                    agent.preupdate()

                if arglist.train_positive_agent and arglist.train_adversaries:
                    # print("updating both")
                    agent.preupdate()

            for m in range(0, len(trainers)):
                agent = trainers[m]

                if not arglist.train_adversaries and m >= num_adversaries:
                    loss = agent.update(trainers, train_step)

                if not arglist.train_positive_agent and m < num_adversaries:
                    loss = agent.update(trainers, train_step)

                if arglist.train_positive_agent and arglist.train_adversaries:
                    loss = agent.update(trainers, train_step)

            # save model, display training output
            if terminal and (len(episode_rewards) % arglist.save_rate == 0):
                U.save_state(arglist.save_dir, saver=saver)
                # print statement depends on whether or not there are adversaries
                if num_adversaries == 0:
                    print(
                        "steps: {}, episodes: {}, mean episode reward: {}, time: {}"
                        .format(train_step, len(episode_rewards),
                                np.mean(episode_rewards[-arglist.save_rate:]),
                                round(time.time() - t_start, 3)))
                else:
                    print(
                        "steps: {}, episodes: {}, mean episode reward: {}, agent episode reward: {}, time: {}"
                        .format(train_step, len(episode_rewards),
                                np.mean(episode_rewards[-arglist.save_rate:]),
                                [
                                    np.mean(rew[-arglist.save_rate:])
                                    for rew in agent_rewards
                                ], round(time.time() - t_start, 3)))
                t_start = time.time()
                # Keep track of final episode reward
                final_ep_rewards.append(
                    np.mean(episode_rewards[-arglist.save_rate:-1]))
                # for rew in agent_rewards:
                #     final_ep_ag_rewards.append(np.mean(rew[-arglist.save_rate:]))

                # for rew in agent_rewards:
                for j in range(len(agent_rewards)):
                    rew = agent_rewards[j]
                    final_ep_ag_rewards[j].append(
                        np.mean(rew[-arglist.save_rate:-1]))

            # saves final episode reward for plotting training curve later
            if len(episode_rewards) > arglist.num_episodes:
                rew_file_name = arglist.plots_dir + arglist.exp_name + '_rewards.pkl'
                with open(rew_file_name, 'wb') as fp:
                    pickle.dump(final_ep_rewards, fp)
                agrew_file_name = arglist.plots_dir + arglist.exp_name + '_agrewards.pkl'
                with open(agrew_file_name, 'wb') as fp:
                    pickle.dump(final_ep_ag_rewards, fp)
                print('...Finished total of {} episodes.'.format(
                    len(episode_rewards)))

                agent_rewards = np.array(final_ep_ag_rewards)
                episode_rewards = np.array(final_ep_rewards)

                np.save(
                    arglist.plots_dir + arglist.exp_name +
                    '_agent_rewards.npy', agent_rewards)
                np.save(
                    arglist.plots_dir + arglist.exp_name +
                    '_episode_rewards.npy', episode_rewards)

                fig, ax = plt.subplots()
                for k in range(len(agent_rewards)):
                    ax.plot(agent_rewards[k], label="agent_" + str(k))

                ax.plot(episode_rewards, label="total")

                ax.legend()
                plt.savefig(arglist.plots_dir + arglist.exp_name + '_plot.png')
                plt.show()

                break
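
Example #22 restores only a subset of agents by building a tf.train.Saver over the variables that live in selected agent_<i> variable scopes. A condensed sketch of that pattern (TF 1.x API, matching the style of these examples; the helper name is illustrative):

import tensorflow as tf

def saver_for_agents(agent_indices):
    # Collect the global variables under each selected agent's scope and build
    # a Saver restricted to them, so loading leaves the other agents untouched.
    var_list = []
    for i in agent_indices:
        var_list += tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES,
                                      scope="agent_" + str(i))
    return tf.train.Saver(var_list=var_list)

# e.g. restore only the adversaries 0..num_adversaries-1:
# saver = saver_for_agents(range(num_adversaries))
# saver.restore(tf.get_default_session(), checkpoint_path)
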
Example #23
    def on_event(self, f):
        @inlineCallbacks
        def set_wheel(self, robot_wheels):
            yield self.call(u'aiwc.set_speed', args.key, robot_wheels)
            return

        # initiate empty frame
        received_frame = Frame()

        if 'time' in f:
            received_frame.time = f['time']
        if 'score' in f:
            received_frame.score = f['score']
        if 'reset_reason' in f:
            received_frame.reset_reason = f['reset_reason']
        if 'coordinates' in f:
            received_frame.coordinates = f['coordinates']
        if 'EOF' in f:
            self.end_of_frame = f['EOF']

        #self.printConsole(received_frame.time)
        #self.printConsole(received_frame.score)
        #self.printConsole(received_frame.reset_reason)
        #self.printConsole(self.end_of_frame)
##############################################################################
        if (self.end_of_frame):

            # How to get the robot and ball coordinates: (ROBOT_ID can be 0,1,2,3,4)
            #self.printConsole(received_frame.coordinates[MY_TEAM][ROBOT_ID][X])
            #self.printConsole(received_frame.coordinates[MY_TEAM][ROBOT_ID][Y])
            #self.printConsole(received_frame.coordinates[MY_TEAM][ROBOT_ID][TH])
            #self.printConsole(received_frame.coordinates[MY_TEAM][ROBOT_ID][ACTIVE])
            #self.printConsole(received_frame.coordinates[MY_TEAM][ROBOT_ID][TOUCH])
            #self.printConsole(received_frame.coordinates[OP_TEAM][ROBOT_ID][X])
            #self.printConsole(received_frame.coordinates[OP_TEAM][ROBOT_ID][Y])
            #self.printConsole(received_frame.coordinates[OP_TEAM][ROBOT_ID][TH])
            #self.printConsole(received_frame.coordinates[OP_TEAM][ROBOT_ID][ACTIVE])
            #self.printConsole(received_frame.coordinates[OP_TEAM][ROBOT_ID][TOUCH])
            #self.printConsole(received_frame.coordinates[BALL][X])
            #self.printConsole(received_frame.coordinates[BALL][Y])

            self.get_coord(received_frame)
            ##############################################################################
            # Next state, Reward, Reset
            # new_obs_n = [np.zeros([self.state_dim * self.history_size]) for _ in range(self.number_of_robots)]
            new_obs_n = []
            rew_n = []
            done_n = []
            for i in range(self.number_of_robots):
                next_state = self.pre_processing(i)
                # self.printConsole(next_state)
                # new_obs_n[i] = np.append(next_state, next_state - self.obs_n[i][:-self.state_dim]) # position and velocity
                new_obs_n.append(
                    np.append(next_state,
                              next_state - self.obs_n[i][:-self.state_dim])
                )  # position and velocity
                # self.printConsole('observation ' + str(i) + ': '+ str(new_obs_n[i]))

                rew_n.append(self.get_reward(received_frame.reset_reason, i))

                if (received_frame.reset_reason != NONE):
                    done_n.append(True)
                else:
                    done_n.append(False)
            done = all(done_n)
            if done:
                self.printConsole("reset reason: " +
                                  str(received_frame.reset_reason))

            # self.printConsole('reward: ' + str(rew_n[0]))
            # rew_n = [sum(rew_n) for i in range(self.number_of_robots)]

            # for i, agent in enumerate(self.trainers):
            #     agent.experience(self.obs_n[i], self.action_n[i], rew_n[i], new_obs_n[i], done_n[i], False)
            for i in range(self.number_of_robots):
                if not self.cur_my_posture[i][ACTIVE]:
                    self.printConsole('robot ' + str(i) + ' is not active')
                    continue
                self.trainers[0].experience(self.obs_n[i], self.action_n[i],
                                            rew_n[i], new_obs_n[i], done_n[i],
                                            False)

            self.obs_n = new_obs_n

            # for i, rew in enumerate(rew_n):
            #     self.episode_rewards[-1] += rew
            #     self.agent_rewards[i][-1] += rew

            # if done:
            #     self.episode_rewards.append(0)
            #     for a in self.agent_rewards:
            #         a.append(0)
            #     self.agent_info.append([[]])
            self.reward_sum += rew_n

            # increment global step counter
            self.train_step += 1

            # update all trainers
            loss = None
            for agent in self.trainers:
                agent.preupdate()
            for agent in self.trainers:
                loss = agent.update(self.trainers, self.train_step)

            # get action
            # self.action_n = [agent.action(obs) for agent, obs in zip(self.trainers,self.obs_n)]
            self.action_n = [
                self.trainers[0].action(obs) for obs in self.obs_n
            ]
            # self.printConsole("original action: " + str(self.action_n[0]))

            for i in range(self.number_of_robots):
                self.wheels[2 * i] = self.max_linear_velocity * (
                    self.action_n[i][1] - self.action_n[i][2] +
                    self.action_n[i][3] - self.action_n[i][4])
                self.wheels[2 * i + 1] = self.max_linear_velocity * (
                    self.action_n[i][1] - self.action_n[i][2] -
                    self.action_n[i][3] + self.action_n[i][4])

            # self.printConsole("                 action: " + str(self.wheels[:2]))
            self.printConsole('step: ' + str(self.train_step))

            self.pre_ball = self.cur_ball
            set_wheel(self, self.wheels.tolist())
            ##############################################################################
            if (self.train_step % self.save_every_steps) == 0:
                U.save_state(self.arglist.save_dir, saver=self.saver)

            # if done: # plot the statics
            # plot every 6000 steps (about 5 minutes)
            if (self.train_step % self.stats_steps) == 0:
                self.printConsole("add data to tensorboard")
                stats = [sum(self.reward_sum)] + [
                    self.reward_sum[i] for i in range(len(self.reward_sum))
                ] + [self.score_sum]
                for i in range(len(stats)):
                    U.get_session().run(self.update_ops[i],
                                        feed_dict={
                                            self.summary_placeholders[i]:
                                            float(stats[i])
                                        })
                summary_str = U.get_session().run(self.summary_op)
                self.summary_writer.add_summary(summary_str, self.inner_step)

                self.reward_sum = np.zeros(len(self.reward_sum))
                self.score_sum = 0
                self.inner_step += 1
##############################################################################
            if (received_frame.reset_reason == GAME_END):
                #(virtual finish() in random_walk.cpp)
                #save your data
                with open(args.datapath + '/result.txt', 'w') as output:
                    # output.write('yourvariables')
                    pass  # the with-block closes the file automatically
                #unsubscribe; reset or leave
                yield self.sub.unsubscribe()
                try:
                    yield self.leave()
                except Exception as e:
                    self.printConsole("Error: {}".format(e))

            self.end_of_frame = False
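The wheel update above folds four of the five action components into differential drive speeds. Below is a minimal numpy sketch of that mapping; the helper name and the reading of indices 1-4 as forward/backward/turn weights are illustrative assumptions, not part of the framework above.

import numpy as np

def action_to_wheel_speeds(action, max_linear_velocity):
    # mirrors the combination used above:
    #   left  = a1 - a2 + a3 - a4
    #   right = a1 - a2 - a3 + a4
    forward = action[1] - action[2]   # net forward drive
    turn = action[3] - action[4]      # net turning drive
    left = max_linear_velocity * (forward + turn)
    right = max_linear_velocity * (forward - turn)
    return left, right

# example: one robot's 5-component action
left, right = action_to_wheel_speeds(np.array([0.0, 0.9, 0.1, 0.3, 0.2]), 1.8)
print(left, right)   # 1.62, 1.26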
Example #24
0
def train(arglist):
    with U.single_threaded_session():
        # Create environment
        env = make_env(arglist.scenario, arglist, arglist.benchmark)
        # Create agent trainers
        obs_shape_n = [env.observation_space[i].shape for i in range(env.n)]
        num_adversaries = min(env.n, arglist.num_adversaries)
        state_shape_n = [(64, ) for i in range(env.n)]
        trainers = get_trainers(env, num_adversaries, obs_shape_n,
                                state_shape_n, arglist)
        print('Using good policy {} and adv policy {}'.format(
            arglist.good_policy, arglist.adv_policy))

        # Initialize
        U.initialize()
        episode_rewards = [0.0]  # sum of rewards for all agents
        agent_rewards = [[0.0]
                         for _ in range(env.n)]  # individual agent reward
        episode_begin_num = 0

        # Load previous results, if necessary
        if arglist.load_dir == "":
            arglist.load_dir = arglist.save_dir
        if arglist.display or arglist.restore or arglist.benchmark:
            print('Loading previous state...')
            U.load_state(arglist.load_dir)
            fname = './learning_curves/' + arglist.exp_name + '_rewards.pkl'
            final_ep_rewards = pickle.load(open(fname, 'rb'))
            fname = './learning_curves/' + arglist.exp_name + '_agrewards.pkl'
            final_ep_ag_rewards = pickle.load(open(fname, 'rb'))
            episode_begin_num = arglist.save_rate * len(final_ep_rewards)

        final_ep_rewards = []  # sum of rewards for training curve
        final_ep_ag_rewards = []  # agent rewards for training curve
        agent_info = [[[]]]  # placeholder for benchmarking info
        saver = tf.train.Saver()

        obs_n = env.reset()
        state_n = [agent.p_init_state(1) for agent in trainers]
        pred_n = [agent.init_pred(1) for agent in trainers]

        episode_step = 0
        train_step = 0
        t_start = time.time()

        print('Starting iterations...')
        while True:
            ## get action
            temp = [
                agent.take_action(obs, state,
                                  pred) for agent, obs, state, pred in zip(
                                      trainers, obs_n, state_n, pred_n)
            ]
            action_n = [x[0] for x in temp]
            new_state_n = [x[1] for x in temp]
            gru_out_n = [x[2] for x in temp]
            new_pred_n = [
                agent.predict(act[None], gru_out)
                for agent, act, gru_out in zip(trainers, action_n, gru_out_n)
            ]

            # environment step
            new_obs_n, rew_n, done_n, info_n = env.step(action_n)
            episode_step += 1
            done = all(done_n)
            terminal = (episode_step >= arglist.max_episode_len)

            # collect experience
            ## need to be modified
            for i, agent in enumerate(trainers):
                agent.experience(obs_n[i], action_n[i], rew_n[i], new_obs_n[i],
                                 done_n[i], terminal)
            obs_n = new_obs_n
            state_n = new_state_n
            # pred_n = [x.eval() for x in new_pred_n]
            pred_n = new_pred_n

            for i, rew in enumerate(rew_n):
                episode_rewards[-1] += rew
                agent_rewards[i][-1] += rew

            if done or terminal:
                obs_n = env.reset()
                state_n = [agent.p_init_state(1) for agent in trainers]
                pred_n = [agent.init_pred(1) for agent in trainers]
                episode_step = 0
                episode_rewards.append(0)
                for a in agent_rewards:
                    a.append(0)
                agent_info.append([[]])

            # increment global step counter
            train_step += 1

            # for benchmarking learned policies
            if arglist.benchmark:
                for i, info in enumerate(info_n):
                    agent_info[-1][i].append(info_n['n'])
                if train_step > arglist.benchmark_iters and (done or terminal):
                    file_name = arglist.benchmark_dir + arglist.exp_name + '.pkl'
                    print('Finished benchmarking, now saving...')
                    with open(file_name, 'wb') as fp:
                        pickle.dump(agent_info[:-1], fp)
                    break
                continue

            # for displaying learned policies
            if arglist.display:
                time.sleep(0.05)
                env.render()
                continue

            # update all trainers, if not in display or benchmark mode
            loss = None
            for agent in trainers:
                agent.preupdate()
            for agent in trainers:
                loss = agent.update(trainers, train_step, arglist.step_size,
                                    arglist.burn_in_step)

            # save model, display training output
            episode_num = len(episode_rewards) + episode_begin_num
            if terminal and (episode_num % arglist.save_rate == 0):
                # print statement depends on whether or not there are adversaries
                if num_adversaries == 0:
                    print(
                        "steps: {}, episodes: {}, mean episode reward: {}, time: {}"
                        .format(train_step, episode_num,
                                np.mean(episode_rewards[-arglist.save_rate:]),
                                round(time.time() - t_start, 3)))
                else:
                    print(
                        "steps: {}, episodes: {}, mean episode reward: {}, agent episode reward: {}, time: {}"
                        .format(train_step, episode_num,
                                np.mean(episode_rewards[-arglist.save_rate:]),
                                [
                                    np.mean(rew[-arglist.save_rate:])
                                    for rew in agent_rewards
                                ], round(time.time() - t_start, 3)))
                t_start = time.time()
                # Keep track of final episode reward
                final_ep_rewards.append(
                    np.mean(episode_rewards[-arglist.save_rate:]))
                for rew in agent_rewards:
                    final_ep_ag_rewards.append(
                        np.mean(rew[-arglist.save_rate:]))

                rew_file_name = arglist.plots_dir + arglist.exp_name + '_rewards.pkl'
                with open(rew_file_name, 'wb') as fp:
                    pickle.dump(final_ep_rewards, fp)
                agrew_file_name = arglist.plots_dir + arglist.exp_name + '_agrewards.pkl'
                with open(agrew_file_name, 'wb') as fp:
                    pickle.dump(final_ep_ag_rewards, fp)
                print('...Finished total of {} episodes.'.format(episode_num))

                U.save_state(arglist.save_dir, saver=saver)

            if episode_num > arglist.num_episodes:
                rew_file_name = arglist.plots_dir + arglist.exp_name + '_rewards.pkl'
                with open(rew_file_name, 'wb') as fp:
                    pickle.dump(final_ep_rewards, fp)
                agrew_file_name = arglist.plots_dir + arglist.exp_name + '_agrewards.pkl'
                with open(agrew_file_name, 'wb') as fp:
                    pickle.dump(final_ep_ag_rewards, fp)
                print('...Finished total of {} episodes.'.format(
                    len(episode_rewards)))
                break
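Compared with the plain loop, the trainer in Example #24 threads a recurrent hidden state (and a per-step prediction) through take_action, and re-initializes both whenever the episode resets. The sketch below shows only that bookkeeping with a stub agent; the class and its methods are illustrative assumptions, not the trainer API used above.

import numpy as np

class RecurrentActorStub:
    """Stand-in for a recurrent policy: keeps a GRU-like hidden state."""
    def __init__(self, state_size=64):
        self.state_size = state_size

    def p_init_state(self, batch_size):
        # zero hidden state, shape (batch, state_size)
        return np.zeros((batch_size, self.state_size))

    def take_action(self, obs, state):
        # toy recurrent update: new state mixes the old state with the observation
        new_state = 0.5 * state + 0.5 * np.tanh(np.sum(obs))
        action = np.tanh(new_state.mean(axis=-1))
        return action, new_state

agent = RecurrentActorStub()
state = agent.p_init_state(1)
for step in range(5):
    obs = np.random.randn(8)
    action, state = agent.take_action(obs, state)   # state carried across steps
    episode_done = (step == 4)
    if episode_done:
        state = agent.p_init_state(1)   # hidden state must be reset with the env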
Example #25
0
def train(arglist, PID=None, lock=None):
    start_time = time.time()
    # global replay_buffer
    with U.single_threaded_session() as sess:
        # Create environment
        env = make_env(arglist.scenario, arglist, arglist.benchmark)
        # Create agents networks
        obs_shape_n = [env.observation_space[i].shape for i in range(env.n)]

        ####changed by yuan li
        num_adversaries = copy.deepcopy(env.num_adversaries)
        arglist.num_adversaries = copy.deepcopy(num_adversaries)

        if comm_rank != 0 and comm_rank != 1:
            req = None
            wait_flag = False

            actors = get_agents(env, num_adversaries, obs_shape_n, arglist)

            U.initialize()

            #var_list = [var for var in tf.trainable_variables()]
            # load the model
            var_list_n = []
            for actor in actors:
                var_list_n.extend(actor.get_variable_list())
            saver = tf.train.Saver(var_list=var_list_n, max_to_keep=20)
            if arglist.load_dir != "":
                U.load_state(arglist.load_dir, saver)

            episode_rewards, agent_rewards, final_ep_rewards, final_ep_ag_rewards, agent_info = initialize_variables(
                env)
            obs_n = env.reset()
            step = 0
            episode_step = 0
            sample_number = 0
            t_start = time.time()
            updata_time = 0
            print('Starting iterations...')

            invalid_train, red_win, red_leave, green_win, green_leave = 0, 0, 0, 0, 0

            while True:
                if not wait_flag:
                    #req = comm.irecv(350000, source=(comm_rank - 1 + comm_size) % comm_size, tag=11)
                    req = comm.irecv(350000, source=0, tag=11)
                    wait_flag = True
                else:
                    data_recv = req.test()
                    if data_recv[0]:
                        wait_flag = False
                        if data_recv[1] == 'finish':
                            #finish = True
                            comm.send('finish', dest=1, tag=11)
                            break
                        else:
                            update_start = time.time()
                            i = 0
                            j = 0
                            for var in tf.trainable_variables():
                                if 11 < (i % 24) < 24:
                                    var.load(data_recv[1][j], sess)
                                    j += 1
                                i += 1

                            #for var in var_list:
                            #    var.load(data_recv[1][i], sess)
                            #    i += 1
                            #print("111111111111111111111111,load param")
                            #for i, actor in enumerate(actors):
                            #    actor.load_weights(data_recv[1][i], sess)
                            update_end = time.time()
                            #print("step:{}, rank0_update_end_time:{}".format(step, update_end))
                            updata_time += (update_end - update_start)
                            step += 1
                    else:
                        wait_flag = True
                        # get action
                        action_n = [
                            agent.action(obs)
                            for agent, obs in zip(actors, obs_n)
                        ]
                        # environment step
                        new_obs_n, rew_n, done_n, info_n = env.step(action_n)
                        episode_step += 1
                        # changed by liyuan
                        done = any(done_n)
                        terminal = (episode_step >= arglist.max_episode_len)
                        ###liyuan: compute the average win rate
                        if green_leave_screen(env) or adversary_all_die(
                                env) or adversary_leave_screen(env):
                            terminal = True

                        if adversary_all_die(env):
                            green_win += 1
                        if green_leave_screen(env):
                            invalid_train += 1
                            green_leave += 1
                        if adversary_leave_screen(env):
                            red_leave += 1

                        if episode_step >= arglist.max_episode_len:
                            for i, agent in enumerate(env.agents):
                                if agent.adversary:
                                    rew_n[i] -= 50

                        if adversary_all_die(env):
                            for i, agent in enumerate(env.agents):
                                if agent.adversary:
                                    rew_n[i] -= 100

                        if done:
                            red_win = red_win + 1
                            for i, agent in enumerate(env.agents):
                                if agent.adversary:
                                    rew_n[i] += 200
                                    rew_n[i] += (
                                        arglist.max_episode_len -
                                        episode_step) / arglist.max_episode_len

                        #send data
                        data = [obs_n, action_n, rew_n, new_obs_n, done_n]
                        comm.send(data, dest=1, tag=11)

                        sample_number += 1

                        #replay_buffer.add(obs_n, action_n, rew_n, new_obs_n, done_n)
                        obs_n = new_obs_n
                        for i, rew in enumerate(rew_n):
                            episode_rewards[-1] += rew
                            agent_rewards[i][-1] += rew

                        if done or terminal:
                            obs_n = env.reset()
                            episode_step = 0
                            episode_rewards.append(0)
                            for a in agent_rewards:
                                a.append(0)
                            agent_info.append([[]])

                        # save model, display training output
                        if (terminal or done) and (len(episode_rewards) %
                                                   arglist.save_rate == 0):
                            if red_win >= 0.8 * arglist.save_rate:
                                temp_dir = arglist.save_dir + "_" + str(
                                    len(episode_rewards)) + "_" + str(
                                        red_win) + "_{}".format(PID)
                                U.save_state(temp_dir, saver=saver)
                            # print statement depends on whether or not there are adversaries
                            if num_adversaries == 0:
                                print(
                                    "Rank {}, steps: {}, episodes: {}, mean episode reward: {}, time: {}"
                                    .format(
                                        comm_rank, sample_number,
                                        len(episode_rewards),
                                        np.mean(episode_rewards[-arglist.
                                                                save_rate:]),
                                        round(time.time() - t_start, 3)))
                            else:
                                print(
                                    "Rank {}, steps: {}, episodes: {}, mean episode reward: {}, agent episode reward: {}, time: {}"
                                    .format(
                                        comm_rank, sample_number,
                                        len(episode_rewards),
                                        np.mean(episode_rewards[-arglist.
                                                                save_rate:]),
                                        [
                                            np.mean(rew[-arglist.save_rate:])
                                            for rew in agent_rewards
                                        ], round(time.time() - t_start, 3)))
                                print(
                                    "Rank  {}, red win: {}, green win: {}, red all leave: {}, green all leave: {}"
                                    .format(comm_rank, red_win, green_win,
                                            red_leave, green_leave))

                                middle_time = time.time()
                                print(
                                    "sample_number:{}, train_step:{}, update_time:{}, total_time:{}"
                                    .format(sample_number, step, updata_time,
                                            middle_time - start_time))
                                mydata = []
                                mydata.append(str(len(episode_rewards)))
                                mydata.append(
                                    str(
                                        np.mean(episode_rewards[-arglist.
                                                                save_rate:])))
                                mydata.append(
                                    str(
                                        np.mean(agent_rewards[0]
                                                [-arglist.save_rate:])))
                                mydata.append(
                                    str(
                                        np.mean(agent_rewards[1]
                                                [-arglist.save_rate:])))
                                mydata.append(
                                    str(
                                        np.mean(agent_rewards[2]
                                                [-arglist.save_rate:])))
                                mydata.append(str(red_win))
                                mydata.append(
                                    str(round(time.time() - t_start, 3)))
                                with open('1mydata_{}.csv'.format(comm_rank),
                                          'a',
                                          newline='') as out:
                                    csv_write = csv.writer(out, dialect='excel')
                                    csv_write.writerow(mydata)

                            if len(episode_rewards) > 3000:
                                U.save_state(arglist.save_dir, saver=saver)

                            invalid_train, red_win, red_leave, green_win, green_leave = 0, 0, 0, 0, 0
                            t_start = time.time()
                            # Keep track of final episode reward
                            final_ep_rewards.append(
                                np.mean(episode_rewards[-arglist.save_rate:]))
                            for rew in agent_rewards:
                                final_ep_ag_rewards.append(
                                    np.mean(rew[-arglist.save_rate:]))

            end_time = time.time()
            print("rank{}_time:{}".format(comm_rank, end_time - start_time))
            print("rank{}_update_time:{}".format(comm_rank, updata_time))
            print("rank{}_step:{}".format(comm_rank, step))

        if comm_rank == 1:
            replay_buffer = ReplayBuffer(1e6)

            wait_flag_1 = False
            wait_flag_2 = False
            wait_flag_3 = False
            req1 = None
            req2 = None
            req3 = None
            sample = 0
            step = 0
            req_list = []
            while True:
                if not wait_flag_1 or not wait_flag_2 or not wait_flag_3:
                    if not wait_flag_1:
                        req1 = comm.irecv(source=2, tag=11)
                        wait_flag_1 = True
                    if not wait_flag_2:
                        req2 = comm.irecv(source=3, tag=11)
                        wait_flag_2 = True
                    if not wait_flag_3:
                        req3 = comm.irecv(source=4, tag=11)
                        wait_flag_3 = True
                else:
                    data_recv_1 = req1.test()
                    data_recv_2 = req2.test()
                    data_recv_3 = req3.test()
                    if data_recv_1[0] or data_recv_2[0] or data_recv_3[0]:
                        if data_recv_1[0]:
                            wait_flag_1 = False
                            if data_recv_1[1] == 'finish':
                                break
                            else:
                                obs_n, action_n, rew_n, new_obs_n, done_n = data_recv_1[
                                    1]
                                replay_buffer.add(obs_n, action_n, rew_n,
                                                  new_obs_n, done_n)
                                sample += 1

                        if data_recv_2[0]:
                            wait_flag_2 = False
                            if data_recv_2[1] == 'finish':
                                break
                            else:
                                obs_n, action_n, rew_n, new_obs_n, done_n = data_recv_2[
                                    1]
                                replay_buffer.add(obs_n, action_n, rew_n,
                                                  new_obs_n, done_n)
                                sample += 1

                        if data_recv_3[0]:
                            wait_flag_3 = False
                            if data_recv_3[1] == 'finish':
                                break
                            else:
                                obs_n, action_n, rew_n, new_obs_n, done_n = data_recv_3[
                                    1]
                                replay_buffer.add(obs_n, action_n, rew_n,
                                                  new_obs_n, done_n)
                                sample += 1
                        '''
                        # time how long it takes to receive 100 samples and then send a batch
                        if (sample % 100 == 0) and len(replay_buffer) >= arglist.batch_size * arglist.max_episode_len:
                            start = time.time()
                            replay_sample_index = replay_buffer.make_index(arglist.batch_size)
                            send_data = replay_buffer.sample_index(replay_sample_index)
                            #send_data = (obs_n_a, act_n_a, rew_n_a, obs_next_n_a, done_n_a)
                            comm.send(send_data, dest=(comm_rank + 1) % comm_size, tag=11)
                            sample = 0
                            step += 1
                            end = time.time()
                            print("rank1 send sample time:", end-start)
                        '''

                    else:
                        wait_flag_1 = True
                        wait_flag_2 = True
                        wait_flag_3 = True
                        if (sample // 100 > 0) and len(
                                replay_buffer
                        ) >= arglist.batch_size * arglist.max_episode_len:
                            replay_sample_index = replay_buffer.make_index(
                                arglist.batch_size)
                            send_data = replay_buffer.sample_index(
                                replay_sample_index)
                            #send_data = (obs_n_a, act_n_a, rew_n_a, obs_next_n_a, done_n_a)
                            comm.send(send_data, dest=0, tag=11)
                            sample = 0
                            step += 1

            end_time = time.time()
            print("rank1_time:", end_time - start_time)
            print("rank1_step", step)

        if comm_rank == 0:
            extract_time = 0
            step = 0

            learners = get_agents(env, num_adversaries, obs_shape_n, arglist)

            var_list_n = []
            for learner in learners:
                var_list_n.extend(learner.get_variable_list())

            U.initialize()

            #var_list = [var for var in tf.trainable_variables()]

            # load the model
            saver = tf.train.Saver(var_list=var_list_n, max_to_keep=20)
            if arglist.load_dir != "":
                U.load_state(arglist.load_dir, saver)

            while True:
                if step >= STEP:
                    for i in range(comm_size - 2):
                        comm.send('finish', dest=(i + 2), tag=11)
                    break
                else:
                    start = time.time()
                    data_recv = comm.recv(source=1, tag=11)

                    for i, agent in enumerate(learners):
                        agent.update(learners, data_recv)

                    #dict_list = []
                    param = []
                    extract_start = time.time()
                    i = 0
                    for var in tf.trainable_variables():
                        if 11 < (i % 24) < 24:
                            param.append(sess.run(var))
                        i += 1
                    #print("2222222222222222 load weights")
                    #for var in var_list:
                    #   param.append(sess.run(var))

                    extract_end = time.time()
                    extract_time += (extract_end - extract_start)

                    for i in range(comm_size - 2):
                        comm.send(param, dest=(i + 2), tag=11)
                    #print("222222222222222222222222,send param")

                    step += 1
                    end = time.time()
                    #print("rank2 train time:{}, extract_time:{}".format(end - start, extract_end - extract_start))
            end_time = time.time()
            print("rank0_time:", end_time - start_time)
            print("rank0_extract_time:", extract_time)
            print("rank0_step:", step)
Example #26
0
def train(arglist):
    # random.seed(arglist.random_seed)
    # np.random.seed(arglist.random_seed)
    # tf.set_random_seed(arglist.random_seed)

    with U.single_threaded_session():
        # Create environment
        env = make_env(arglist.scenario, arglist, arglist.benchmark)
        # Create agent trainers
        obs_shape_n = [env.observation_space[i].shape for i in range(env.n)]
        num_adversaries = min(env.n, arglist.num_adversaries)
        trainers = get_trainers(env, num_adversaries, obs_shape_n, arglist)
        print('Using good policy {} and adv policy {}'.format(
            arglist.good_policy, arglist.adv_policy))

        # Initialize
        U.initialize()

        savers = [
            tf.train.Saver(U.scope_vars(trainer.name)) for trainer in trainers
        ]

        # Load previous results, if necessary
        if arglist.load_dir == "":
            arglist.load_dir = arglist.save_dir
        if arglist.display or arglist.restore or arglist.benchmark:
            print('Loading previous state...')
            # U.load_state(arglist.load_dir)
            [
                U.load_state(os.path.join(arglist.load_dir,
                                          'team_{}'.format(i)),
                             saver=saver) for i, saver in enumerate(savers)
            ]

        episode_rewards = [0.0]  # sum of rewards for all agents
        agent_rewards = [[0.0]
                         for _ in range(env.n)]  # individual agent reward
        final_ep_rewards = []  # sum of rewards for training curve
        final_ep_ag_rewards = []  # agent rewards for training curve
        agent_info = [[[]]]  # placeholder for benchmarking info
        saver = tf.train.Saver()
        obs_n = env.reset()
        episode_step = 0
        train_step = 0
        if arglist.trainer == 'tarmac' or arglist.trainer == 'reuse_tarmac' or arglist.trainer == 'ibmac_inter':
            message_n = np.zeros([len(obs_n), 4])
        is_training = True

        t_start = time.time()

        writer = tf.summary.FileWriter("graph", U.get_session().graph)
        writer.close()

        writer = SummaryWriter(arglist.save_dir)

        print('Starting iterations...')
        while True:
            # get action
            if arglist.trainer == 'ibmac' or arglist.trainer == 'reuse_ibmac':
                is_inference = False
                if arglist.display or arglist.restore or arglist.benchmark:
                    is_inference = False
                if len(trainers) == 2:
                    action_n1 = trainers[0].action(obs_n[:num_adversaries],
                                                   is_inference=is_inference)
                    action_n2 = trainers[1].action(obs_n[num_adversaries:],
                                                   is_inference=is_inference)
                    action_n = [action[0] for action in action_n1
                                ] + [action[0] for action in action_n2]
                else:
                    action_n = trainers[0].action(obs_n,
                                                  is_inference=is_inference)
                    action_n = [action[0] for action in action_n]
            elif arglist.trainer == 'ibmac_inter':
                if len(trainers) == 2:
                    action_n1, message_action_n1 = trainers[0].action(
                        obs_n[:num_adversaries], message_n[:num_adversaries])
                    action_n2, message_action_n2 = trainers[1].action(
                        obs_n[num_adversaries:], message_n[num_adversaries:])
                    action_n = [action[0] for action in action_n1
                                ] + [action[0] for action in action_n2]
                else:
                    action_n, message_action_n = trainers[0].action(
                        obs_n, message_n)
                    action_n = [action[0] for action in action_n]
                    message_n = [
                        message_action[0]
                        for message_action in message_action_n
                    ]
            else:
                action_n = [
                    agent.action(obs) for agent, obs in zip(trainers, obs_n)
                ]
            # environment step
            new_obs_n, rew_n, done_n, info_n = env.step(action_n)
            episode_step += 1
            done = all(done_n)
            terminal = (episode_step >= arglist.max_episode_len)
            # collect experience
            if arglist.trainer == 'ibmac':
                if len(trainers) == 2:
                    trainers[0].experience(obs_n[:num_adversaries],
                                           action_n[:num_adversaries],
                                           rew_n[:num_adversaries],
                                           new_obs_n[:num_adversaries],
                                           done_n[:num_adversaries], terminal)
                    trainers[1].experience(obs_n[num_adversaries:],
                                           action_n[num_adversaries:],
                                           rew_n[num_adversaries:],
                                           new_obs_n[num_adversaries:],
                                           done_n[num_adversaries:], terminal)
                else:
                    trainers[0].experience(obs_n, action_n, rew_n, new_obs_n,
                                           done_n, terminal)
            elif arglist.trainer == 'ibmac_inter':
                if len(trainers) == 2:
                    trainers[0].experience(obs_n[:num_adversaries],
                                           message_n[:num_adversaries],
                                           action_n[:num_adversaries],
                                           rew_n[:num_adversaries],
                                           new_obs_n[:num_adversaries],
                                           done_n[:num_adversaries], terminal)
                    trainers[1].experience(obs_n[num_adversaries:],
                                           message_n[num_adversaries:],
                                           action_n[num_adversaries:],
                                           rew_n[num_adversaries:],
                                           new_obs_n[num_adversaries:],
                                           done_n[num_adversaries:], terminal)
                else:
                    trainers[0].experience(obs_n, message_n, action_n, rew_n,
                                           new_obs_n, done_n, terminal)
            else:
                for i, agent in enumerate(trainers):
                    agent.experience(obs_n[i], action_n[i], rew_n[i],
                                     new_obs_n[i], done_n[i], terminal)
            obs_n = new_obs_n

            for i, rew in enumerate(rew_n):
                episode_rewards[-1] += rew
                agent_rewards[i][-1] += rew

            if done or terminal:
                obs_n = env.reset()
                episode_step = 0
                episode_rewards.append(0)
                for a in agent_rewards:
                    a.append(0)
                agent_info.append([[]])

            # increment global step counter
            train_step += 1

            # for benchmarking learned policies
            if arglist.benchmark:
                for i, info in enumerate(info_n):
                    agent_info[-1][i].append(info_n['n'])
                if train_step > arglist.benchmark_iters and (done or terminal):
                    file_name = arglist.benchmark_dir + arglist.exp_name + '.pkl'
                    print('Finished benchmarking, now saving...')
                    with open(file_name, 'wb') as fp:
                        pickle.dump(agent_info[:-1], fp)
                    break
                continue

            # for displaying learned policies
            if arglist.display:
                env.render()
                continue

            # update all trainers, if not in display or benchmark mode
            loss = None
            for agent in trainers:
                agent.preupdate()
            for i, agent in enumerate(trainers):
                loss = agent.update(trainers, train_step)
                if loss:
                    if isinstance(agent, IBMACAgentTrainer) or isinstance(
                            agent, ReuseIBMACAgentTrainer):
                        q_loss, p_loss, _, _, _, _, kl_loss = loss
                        writer.add_scalar('agent_{}/loss_kl'.format(i),
                                          kl_loss, train_step)
                    else:
                        q_loss, p_loss, _, _, _, _ = loss
                    writer.add_scalar('agent_{}/loss_policy'.format(i), p_loss,
                                      train_step)
                    writer.add_scalar('agent_{}/loss_critic'.format(i), q_loss,
                                      train_step)

            # save model, display training output
            if terminal and (len(episode_rewards) % arglist.save_rate == 0):
                U.save_state(arglist.save_dir, saver=saver)
                [
                    U.save_state(os.path.join(arglist.save_dir,
                                              'team_{}'.format(i)),
                                 saver=saver) for i, saver in enumerate(savers)
                ]
                # print statement depends on whether or not there are adversaries

                for i in range(len(agent_rewards)):
                    writer.add_scalar(
                        'agent_{}/mean_episode_reward'.format(i),
                        np.mean(agent_rewards[i][-arglist.save_rate:]),
                        len(episode_rewards))

                if num_adversaries == 0:
                    print(
                        "steps: {}, episodes: {}, mean episode reward: {}, time: {}"
                        .format(train_step, len(episode_rewards),
                                np.mean(episode_rewards[-arglist.save_rate:]),
                                round(time.time() - t_start, 3)))
                else:
                    print(
                        "steps: {}, episodes: {}, mean episode reward: {}, agent episode reward: {}, time: {}"
                        .format(train_step, len(episode_rewards),
                                np.mean(episode_rewards[-arglist.save_rate:]),
                                [
                                    np.mean(rew[-arglist.save_rate:])
                                    for rew in agent_rewards
                                ], round(time.time() - t_start, 3)))
                t_start = time.time()
                # Keep track of final episode reward
                final_ep_rewards.append(
                    np.mean(episode_rewards[-arglist.save_rate:]))
                for rew in agent_rewards:
                    final_ep_ag_rewards.append(
                        np.mean(rew[-arglist.save_rate:]))

            # saves final episode reward for plotting training curve later
            if len(episode_rewards) > arglist.num_episodes:
                rew_file_name = arglist.plots_dir + arglist.exp_name + '_rewards.pkl'
                with open(rew_file_name, 'wb') as fp:
                    pickle.dump(final_ep_rewards, fp)
                agrew_file_name = arglist.plots_dir + arglist.exp_name + '_agrewards.pkl'
                with open(agrew_file_name, 'wb') as fp:
                    pickle.dump(final_ep_ag_rewards, fp)
                print('...Finished total of {} episodes.'.format(
                    len(episode_rewards)))
                break
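Example #26 builds one saver per trainer scope so each team can be checkpointed and restored on its own. The sketch below shows the same idea with plain TF1 collections in place of the U.scope_vars helper used above; the scope names and checkpoint paths are assumptions for illustration.

import os
import tensorflow as tf  # TF1.x-style graph API

with tf.variable_scope('team_0'):
    w0 = tf.get_variable('w', shape=[4], initializer=tf.zeros_initializer())
with tf.variable_scope('team_1'):
    w1 = tf.get_variable('w', shape=[4], initializer=tf.ones_initializer())

# one saver per scope: each checkpoint holds only that team's variables
savers = {
    scope: tf.train.Saver(
        var_list=tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope=scope))
    for scope in ('team_0', 'team_1')
}

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    for scope, saver in savers.items():
        ckpt_dir = './checkpoints/{}'.format(scope)
        os.makedirs(ckpt_dir, exist_ok=True)
        saver.save(sess, os.path.join(ckpt_dir, 'model'))
    # restoring one team leaves the other team's weights untouched
    savers['team_0'].restore(sess, './checkpoints/team_0/model')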
Example #27
0
def train(arglist):
    with U.single_threaded_session():
        if not os.path.isdir(arglist.save_dir):
            os.makedirs(arglist.save_dir)
        if not os.path.isdir(arglist.benchmark_dir):
            os.makedirs(arglist.benchmark_dir)
        if not os.path.isdir(arglist.plots_dir):
            os.makedirs(arglist.plots_dir)

        #tensorboard
        summary_writer = tf.summary.FileWriter(
            "./" + arglist.exp_name + "_graph/",
            U.get_session().graph)
        reward_plot = None
        reward_summary = tf.Summary()
        reward_summary.value.add(tag='reward', simple_value=reward_plot)

        # Create environment
        env = make_env(arglist.scenario, arglist, arglist.benchmark)
        # Create agent trainers
        obs_shape_n = [env.observation_space[i].shape for i in range(env.n)]
        num_adversaries = min(env.n, arglist.num_adversaries)
        trainers = get_trainers(env, num_adversaries, obs_shape_n, arglist)
        print('Using good policy {} and adv policy {}'.format(
            arglist.good_policy, arglist.adv_policy))

        # Initialize
        U.initialize()

        # Load previous results, if necessary
        if arglist.load_dir == "":
            arglist.load_dir = arglist.save_dir
        if arglist.display or arglist.restore or arglist.benchmark:
            print('Loading previous state...')
            U.load_state(arglist.load_dir)

        episode_rewards = [0.0]  # sum of rewards for all agents
        agent_rewards = [[0.0]
                         for _ in range(env.n)]  # individual agent reward
        final_ep_rewards = []  # sum of rewards for training curve
        final_ep_ag_rewards = []  # agent rewards for training curve
        agent_info = [[[]]]  # placeholder for benchmarking info
        saver = tf.train.Saver()
        obs_n = env.reset()
        episode_step = 0
        train_step = 0
        """
        #### USE RVO 
        """
        use_rvo_range = -1  # to enable RVO, set this to 0.28

        t_start = time.time()

        print('Starting iterations...')
        while True:
            # get action
            action_n = [
                agent.action(obs) for agent, obs in zip(trainers, obs_n)
            ]

            if use_rvo_range < 0:
                new_obs_n, rew_n, done_n, info_n = env.step(action_n,
                                                            use_rvo=None)
            else:
                # use_rvo list
                total_rvo_list = []
                for obs in obs_n:
                    agent_pos = obs[-2 * (env.world.num_agents - 1)::]
                    obst_pos = obs[-2 * (env.world.num_agents +
                                         env.world.num_obstacles)::]
                    agent_rvo_list = []
                    for i in range(0, len(agent_pos), 2):
                        if np.sqrt(np.sum(np.square(
                                agent_pos[i:i + 2]))) < use_rvo_range:
                            agent_rvo_list.append(True)
                        else:
                            agent_rvo_list.append(False)
                    for i in range(0, len(obst_pos), 2):
                        if np.sqrt(np.sum(np.square(
                                obst_pos[i:i + 2]))) < use_rvo_range:
                            agent_rvo_list.append(True)
                        else:
                            agent_rvo_list.append(False)

                    if any(agent_rvo_list):
                        total_rvo_list.append(True)
                    else:
                        total_rvo_list.append(False)
                # environment step
                new_obs_n, rew_n, done_n, info_n = env.step(
                    action_n, use_rvo=total_rvo_list)

            episode_step += 1
            done = all(done_n)
            terminal = (episode_step >= arglist.max_episode_len)
            # collect experience
            for i, agent in enumerate(trainers):
                agent.experience(obs_n[i], action_n[i], rew_n[i], new_obs_n[i],
                                 done_n[i], terminal)
            obs_n = new_obs_n

            for i, rew in enumerate(rew_n):
                episode_rewards[-1] += rew
                agent_rewards[i][-1] += rew

            if done or terminal:
                obs_n = env.reset()
                episode_step = 0
                episode_rewards.append(0)
                for a in agent_rewards:
                    a.append(0)
                agent_info.append([[]])
            # increment global step counter
            train_step += 1

            # for benchmarking learned policies
            if arglist.benchmark:
                for i, info in enumerate(info_n):
                    agent_info[-1][i].append(info_n['n'])
                if train_step > arglist.benchmark_iters and (done or terminal):
                    file_name = arglist.benchmark_dir + arglist.exp_name + '.pkl'
                    print('Finished benchmarking, now saving...')
                    with open(file_name, 'wb') as fp:
                        pickle.dump(agent_info[:-1], fp)
                    break
                continue

            # for displaying learned policies
            if arglist.display:
                time.sleep(0.1)
                env.render()
                continue

            # update all trainers, if not in display or benchmark mode
            loss = None
            for agent in trainers:
                agent.preupdate()
            for agent in trainers:
                loss = agent.update(trainers, train_step)

            # add reward to tensorboard
            reward_summary.value[0].simple_value = np.mean(
                episode_rewards[-arglist.save_rate:])
            summary_writer.add_summary(reward_summary, len(episode_rewards))

            # save model, display training output
            if terminal and (len(episode_rewards) % arglist.save_rate == 0):
                U.save_state(arglist.save_dir, saver=saver)
                # print statement depends on whether or not there are adversaries
                if num_adversaries == 0:
                    print(
                        "steps: {}, episodes: {}, mean episode reward: {}, time: {}"
                        .format(train_step, len(episode_rewards),
                                np.mean(episode_rewards[-arglist.save_rate:]),
                                round(time.time() - t_start, 3)))
                else:
                    print(
                        "steps: {}, episodes: {}, mean episode reward: {}, agent episode reward: {}, time: {}"
                        .format(train_step, len(episode_rewards),
                                np.mean(episode_rewards[-arglist.save_rate:]),
                                [
                                    np.mean(rew[-arglist.save_rate:])
                                    for rew in agent_rewards
                                ], round(time.time() - t_start, 3)))

                t_start = time.time()
            if terminal:
                # Keep track of final episode reward
                final_ep_rewards.append(
                    np.mean(episode_rewards[-arglist.save_rate:]))
                for rew in agent_rewards:
                    final_ep_ag_rewards.append(
                        np.mean(rew[-arglist.save_rate:]))

            # saves final episode reward for plotting training curve later
            if len(episode_rewards) % 1000 == 0:
                rew_file_name = arglist.plots_dir + arglist.exp_name + '_rewards.pkl' + str(
                    len(episode_rewards))
                with open(rew_file_name, 'wb') as fp:
                    pickle.dump(final_ep_rewards, fp)
                agrew_file_name = arglist.plots_dir + arglist.exp_name + '_agrewards.pkl'
                with open(agrew_file_name, 'wb') as fp:
                    pickle.dump(final_ep_ag_rewards, fp)
                print('saved')
            if len(episode_rewards) > arglist.num_episodes:
                print('...Finished total of {} episodes.'.format(
                    len(episode_rewards)))
                break
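Example #27 only hands control to the RVO planner when something is inside use_rvo_range, and it decides that by scanning the trailing relative-position entries of each observation. A small numpy sketch of that distance gate follows; the assumed layout (relative x/y pairs at the end of the observation) mirrors the slicing in the snippet above, and the helper name is hypothetical.

import numpy as np

def needs_rvo(obs, num_neighbors, use_rvo_range):
    # take the last num_neighbors relative (x, y) pairs and check their distances
    rel = np.asarray(obs)[-2 * num_neighbors:].reshape(num_neighbors, 2)
    dists = np.linalg.norm(rel, axis=1)
    return bool(np.any(dists < use_rvo_range))

# example: two neighbours, the second one closer than 0.28
obs = np.array([0.1, -0.4, 0.05, 0.2, 1.0, 1.0, 0.1, 0.2])
print(needs_rvo(obs, num_neighbors=2, use_rvo_range=0.28))   # True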
Example #28
0
def train(arglist):
    with U.single_threaded_session():
        # Create environment
        env = make_env(arglist.scenario, arglist, arglist.benchmark)
        obs_n = env.reset()
        # Create agent trainers
        obs_shape_n = [env.observation_space[i].shape for i in range(env.n)]
        num_adversaries = min(env.n, arglist.num_adversaries)
        trainers = get_trainers(env, num_adversaries, obs_shape_n, arglist)
        print('Using good policy {} and adv policy {}'.format(
            arglist.good_policy, arglist.adv_policy))
        # Pretrain the safety_layer
        safety_layer = None
        if arglist.use_safety_layer:
            safety_layer = SafetyLayer(env,
                                       len(env.world.landmarks) - 1,
                                       mlp_model_safety_layer,
                                       env.observation_space[0].shape,
                                       env.action_space, trainers[0].action)
            # set safety_layer for trainer[0]
            trainers[0].set_safety_layer(safety_layer)
        if arglist.use_mpc_layer:
            safety_layer = MpcLayer(env)
            # set safety_layer for trainer[0]
            trainers[0].set_safety_layer(safety_layer)

        # Initialize
        U.initialize()

        # Load previous results, if necessary
        if arglist.load_dir == "":
            arglist.load_dir = arglist.save_dir
        if arglist.display or arglist.restore or arglist.benchmark:
            print('Loading previous state...')
            U.load_state(arglist.load_dir)

        episode_rewards = [0.0]  # sum of rewards for all agents
        agent_rewards = [[0.0]
                         for _ in range(env.n)]  # individual agent reward
        final_ep_rewards = []  # sum of rewards for training curve
        final_ep_ag_rewards = []  # agent rewards for training curve
        agent_info = [[[]]]  # placeholder for benchmarking info
        saver = tf.train.Saver()
        episode_step = 0
        train_step = 0
        cumulative_constraint_violations = 0
        t_start = time.time()
        data_save = []
        num_done = 0

        # pickle env
        # env0 = copy.deepcopy(env)
        '''file_path = open('env.pkl', 'rb')
        import pickle
        for i in range(len(env.world.landmarks)):
            env.world.landmarks[i] = pickle.load(file_path)
        for i in range(len(env.world.agents)):
            env.world.agents[i] = pickle.load(file_path)
        obs_n = []
        agents = env.world.agents
        for agent in agents:
            obs_n.append(env._get_obs(agent))'''

        print('Starting iterations...')
        while True:
            # get constraint_values
            c_n = env.get_constraint_values()
            is_any_collision = env.is_any_collision()
            if is_any_collision[0]:
                cumulative_constraint_violations = cumulative_constraint_violations + 1
            '''if c_n[0][0] > 0:
                print("there is a c_n > 0")'''
            # get action
            action_n = [
                agent.action_real(obs, c, env)
                for agent, obs, c in zip(trainers, obs_n, c_n)
            ]
            action_real = [action_n[0][0]]
            if_call = [action_n[0][2]]
            action_n = [action_n[0][1]]
            data_save.append(
                np.concatenate([obs_n[0], action_n[0], action_n[0]]))
            # environment step
            new_obs_n, rew_n, done_n, info_n = env.step(action_n,
                                                        if_call=if_call)
            '''is_any_collision_new = env.is_any_collision()
            if is_any_collision_new[0]:
                env.is_any_collision()
                dist = np.sqrt(np.sum(np.square(env.agents[0].state.p_pos - env.world.landmarks[0].state.p_pos))) -\
                       (env.agents[0].size + env.world.landmarks[0].size)
                # print("aaa", env.agents[0].state.p_pos, dist)'''

            # new c_n
            # new_c_n = env.get_constraint_values()
            episode_step += 1
            done = all(done_n)
            terminal = (episode_step >= arglist.max_episode_len) or \
                       (env.agents[0].state.p_pos[0] - env.world.landmarks[-1].state.p_pos[0]) > 1.5
            # collect experience
            for i, agent in enumerate(trainers):
                agent.experience(obs_n[i], action_n[i], rew_n[i], new_obs_n[i],
                                 done_n[i], terminal)
            obs_n = new_obs_n

            for i, rew in enumerate(rew_n):
                episode_rewards[-1] += rew
                agent_rewards[i][-1] += rew

            if done or terminal:
                if done:
                    num_done = num_done + 1

                data_save.append(
                    np.concatenate([obs_n[0], action_n[0], action_n[0]]))
                data_save = np.array(data_save)
                '''np.savetxt("data_save.txt", data_save)'''  # by default data is saved in '%.18e' format, separated by spaces

                # plot x, y, v, theta
                a = data_save
                V = a[:, 1]
                x = a[:, 2]
                y = a[:, 3]
                theta = a[:, 4]
                omega = a[:, 5]
                # action_n = a[:, 26] - a[:, 27]
                # action_real = a[:, 31] - a[:, 32]
                fig, ax0 = plt.subplots()
                for i, landmark in enumerate(env.world.landmarks[:-1]):
                    p_pos = landmark.state.p_pos
                    r = landmark.size
                    circle = mpathes.Circle(p_pos,
                                            r,
                                            facecolor='w',
                                            edgecolor='forestgreen',
                                            linestyle='-.')
                    ax0.add_patch(circle)
                for i, landmark in enumerate(env.world.landmarks):
                    p_pos = landmark.state.p_pos
                    r = (landmark.size -
                         0.09) if landmark is not env.world.landmarks[
                             -1] else landmark.size
                    circle = mpathes.Circle(p_pos, r, facecolor='forestgreen')
                    ax0.add_patch(circle)
                for i in range(len(x)):
                    p_pos = np.array([x[i], y[i]])
                    r = env.world.agents[0].size
                    circle = mpathes.Circle(p_pos, r, facecolor='darkgreen')
                    ax0.add_patch(circle)
                ax0.set_xlim((-1, 40))
                ax0.set_ylim((-10, 10))
                ax0.axis('equal')
                ax0.set_title("x-y")
                x1 = [-1, 40]
                y1 = [10, 10]
                y2 = [-10, -10]
                ax0.plot(x1, y1, color='forestgreen', linestyle='-.')
                ax0.plot(x1, y2, color='forestgreen', linestyle='-.')
                plt.show()
                '''fig, ax = plt.subplots(ncols=2, nrows=2)
                for i, landmark in enumerate(env.world.landmarks):
                    p_pos = landmark.state.p_pos
                    r = landmark.size
                    circle = mpathes.Circle(p_pos, r)
                    ax[0, 0].add_patch(circle)
                for i in range(len(x)):
                    p_pos = np.array([x[i], y[i]])
                    r = env.world.agents[0].size
                    circle = mpathes.Circle(p_pos, r)
                    ax[0, 0].add_patch(circle)
                ax[0, 0].set_xlim((-1, 20))
                ax[0, 0].set_ylim((-10.3, 10.3))
                ax[0, 0].set_title("x-y")
                ax[0, 0].axis('equal')
                ax[0, 1].plot(theta)
                ax[0, 1].set_title("theta")
                ax[1, 0].plot(omega)
                ax[1, 0].set_title("omega")
                # ax[1, 1].plot(action_n * 0.12)
                # ax[1, 1].set_title("action_n")
                plt.show()'''

                # reset and continue
                data_save = []
                obs_n = env.reset()
                # env0 = copy.deepcopy(env)
                episode_step = 0
                episode_rewards.append(0)
                for a in agent_rewards:
                    a.append(0)
                agent_info.append([[]])

            # increment global step counter
            train_step += 1

            # for benchmarking learned policies
            if arglist.benchmark:
                for i, info in enumerate(info_n):
                    agent_info[-1][i].append(info_n['n'])
                if train_step > arglist.benchmark_iters and (done or terminal):
                    file_name = arglist.benchmark_dir + arglist.exp_name + '.pkl'
                    print('Finished benchmarking, now saving...')
                    with open(file_name, 'wb') as fp:
                        pickle.dump(agent_info[:-1], fp)
                    break
                continue

            # for displaying learned policies
            if arglist.display:
                time.sleep(0.1)
                env.render()
                continue

            # update all trainers, if not in display or benchmark mode
            loss = None
            for agent in trainers:
                agent.preupdate()
            '''for agent in trainers:
                loss = agent.update(trainers, train_step)'''

            # save model, display training output
            if (done or terminal) and ((len(episode_rewards) - 1) %
                                       arglist.save_rate == 0):
                U.save_state(arglist.save_dir, saver=saver)
                # print statement depends on whether or not there are adversaries
                if num_adversaries == 0:
                    print(
                        "steps: {}, episodes: {}, mean episode reward: {}, num_cumulative_constraints: {}, num_done: {}, time: {}"
                        .format(train_step,
                                len(episode_rewards) - 1,
                                np.mean(episode_rewards[-arglist.save_rate:]),
                                cumulative_constraint_violations, num_done,
                                round(time.time() - t_start, 3)))
                else:
                    print(
                        "steps: {}, episodes: {}, mean episode reward: {}, num_cumulative_constraints: {}, num_done: {}, time: {}"
                        .format(train_step,
                                len(episode_rewards) - 1,
                                np.mean(episode_rewards[-arglist.save_rate:]),
                                cumulative_constraint_violations, num_done,
                                round(time.time() - t_start, 3)))
                    # print(trainers[0].safety_layer.num_call)
                t_start = time.time()
                num_done = 0
                cumulative_constraint_violations = 0
                # Keep track of final episode reward
                final_ep_rewards.append(
                    np.mean(episode_rewards[-arglist.save_rate:]))
                for rew in agent_rewards:
                    final_ep_ag_rewards.append(
                        np.mean(rew[-arglist.save_rate:]))

            # saves final episode reward for plotting training curve later
            if len(episode_rewards) > arglist.num_episodes:
                rew_file_name = arglist.plots_dir + arglist.exp_name + '_rewards.pkl'
                with open(rew_file_name, 'wb') as fp:
                    pickle.dump(final_ep_rewards, fp)
                agrew_file_name = arglist.plots_dir + arglist.exp_name + '_agrewards.pkl'
                with open(agrew_file_name, 'wb') as fp:
                    pickle.dump(final_ep_ag_rewards, fp)
                print('...Finished total of {} episodes.'.format(
                    len(episode_rewards)))
                break
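
The `_rewards.pkl` and `_agrewards.pkl` files written above are plain pickled lists of floats (one smoothed value per save_rate window), so the training curve can be plotted afterwards with only pickle and matplotlib. A minimal sketch, assuming the same plots_dir/exp_name used at training time; the file paths below are illustrative:

import pickle
import matplotlib.pyplot as plt

# Assumed paths: they mirror how the loop above builds them
# (arglist.plots_dir + arglist.exp_name + '_rewards.pkl').
rew_file = './learning_curves/exp1_rewards.pkl'
agrew_file = './learning_curves/exp1_agrewards.pkl'

with open(rew_file, 'rb') as fp:
    final_ep_rewards = pickle.load(fp)      # mean total reward per save_rate window
with open(agrew_file, 'rb') as fp:
    final_ep_ag_rewards = pickle.load(fp)   # per-agent means, appended in agent order

plt.plot(final_ep_rewards)
plt.xlabel('save_rate window')
plt.ylabel('mean episode reward')
plt.title('training curve')
plt.show()
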
Example #29
0
def train(arglist):
    with U.single_threaded_session():
        # Create environment
        env = make_env(arglist.scenario, arglist, arglist.benchmark)
        # Create agent trainers
        obs_shape_n = [env.observation_space[i].shape for i in range(env.n)]
        num_adversaries = min(env.n, arglist.num_adversaries)
        trainers = get_trainers(env, num_adversaries, obs_shape_n, arglist)
        print('Using good policy {} and adv policy {}'.format(
            arglist.good_policy, arglist.adv_policy))

        # Initialize
        U.initialize()

        # Load previous results, if necessary
        if arglist.load_dir == "":
            arglist.load_dir = arglist.save_dir
        if arglist.display or arglist.restore or arglist.benchmark or arglist.plot:
            print('Loading previous state...')
            U.load_state(arglist.load_dir)

        episode_rewards = [0.0]  # sum of rewards for all agents
        agent_rewards = [[0.0]
                         for _ in range(env.n)]  # individual agent reward
        final_ep_rewards = []  # sum of rewards for training curve
        final_ep_ag_rewards = []  # agent rewards for training curve
        agent_info = [[[]]]  # placeholder for benchmarking info
        saver = tf.train.Saver()
        obs_n = env.reset()
        episode_step = 0
        train_step = 0
        t_start = time.time()
        plot_data = []

        print('Starting iterations...')
        while True:
            # get action
            action_n = [
                agent.action(obs) for agent, obs in zip(trainers, obs_n)
            ]
            # environment step
            new_obs_n, rew_n, done_n, info_n = env.step(action_n)
            episode_step += 1
            done = all(done_n)
            terminal = (episode_step >= arglist.max_episode_len)
            # collect experience
            for i, agent in enumerate(trainers):
                agent.experience(obs_n[i], action_n[i], rew_n[i], new_obs_n[i],
                                 done_n[i], terminal)
            obs_n = new_obs_n

            plot_d = env.get_plot_data()

            for i, rew in enumerate(rew_n):
                episode_rewards[-1] += rew
                agent_rewards[i][-1] += rew

            plot_data.append(plot_d)

            if done or terminal:
                if arglist.plot:
                    if arglist.scenario == "simple_spread" or arglist.scenario == "simple_spread_obstacles":
                        plot_spread(plot_data)
                    if arglist.scenario == "simple_formation" or arglist.scenario == "simple_formation_obstacles":
                        plot_formation(plot_data)
                obs_n = env.reset()
                episode_step = 0
                episode_rewards.append(0)
                for a in agent_rewards:
                    a.append(0)
                agent_info.append([[]])
                plot_data = []

            # increment global step counter
            train_step += 1

            # for benchmarking learned policies
            if arglist.benchmark:
                for i, info in enumerate(info_n):
                    agent_info[-1][i].append(info_n['n'])
                if train_step > arglist.benchmark_iters and (done or terminal):
                    file_name = arglist.benchmark_dir + arglist.exp_name + '.pkl'
                    print('Finished benchmarking, now saving...')
                    with open(file_name, 'wb') as fp:
                        pickle.dump(agent_info[:-1], fp)
                    break
                continue

            # for displaying learned policies
            if arglist.display:
                time.sleep(0.1)
                env.render()
                continue

            # update all trainers, if not in display or benchmark mode
            loss = None
            for agent in trainers:
                agent.preupdate()
            for agent in trainers:
                loss = agent.update(trainers, train_step)

            # save model, display training output
            if terminal and (len(episode_rewards) % arglist.save_rate == 0):
                U.save_state(arglist.save_dir, saver=saver)
                # print statement depends on whether or not there are adversaries
                if num_adversaries == 0:
                    print(
                        "steps: {}, episodes: {}, mean episode reward: {}, time: {}"
                        .format(train_step, len(episode_rewards),
                                np.mean(episode_rewards[-arglist.save_rate:]),
                                round(time.time() - t_start, 3)))
                else:
                    print(
                        "steps: {}, episodes: {}, mean episode reward: {}, agent episode reward: {}, time: {}"
                        .format(train_step, len(episode_rewards),
                                np.mean(episode_rewards[-arglist.save_rate:]),
                                [
                                    np.mean(rew[-arglist.save_rate:])
                                    for rew in agent_rewards
                                ], round(time.time() - t_start, 3)))
                t_start = time.time()
                # Keep track of final episode reward
                final_ep_rewards.append(
                    np.mean(episode_rewards[-arglist.save_rate:]))
                for rew in agent_rewards:
                    final_ep_ag_rewards.append(
                        np.mean(rew[-arglist.save_rate:]))

            # saves final episode reward for plotting training curve later
            if len(episode_rewards) > arglist.num_episodes:
                rew_file_name = arglist.plots_dir + arglist.exp_name + '_rewards.pkl'
                with open(rew_file_name, 'wb') as fp:
                    pickle.dump(episode_rewards, fp)
                agrew_file_name = arglist.plots_dir + arglist.exp_name + '_agrewards.pkl'
                with open(agrew_file_name, 'wb') as fp:
                    pickle.dump(agent_rewards, fp)
                print('...Finished total of {} episodes.'.format(
                    len(episode_rewards)))
                break
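
Unlike the previous example, this variant pickles the raw per-episode totals (episode_rewards, agent_rewards) rather than the save_rate-smoothed final_ep_* lists, so any smoothing has to happen at plotting time. A minimal sketch of that post-hoc smoothing, assuming a window equal to the save_rate used for training (1000 below is only an illustration) and an illustrative file path:

import pickle
import numpy as np

save_rate = 1000  # assumption: should match the save_rate used during training

with open('./learning_curves/exp1_rewards.pkl', 'rb') as fp:
    episode_rewards = pickle.load(fp)  # raw total reward of every episode

# Mean over consecutive, non-overlapping save_rate windows, matching the
# np.mean(episode_rewards[-arglist.save_rate:]) printed during training.
n = len(episode_rewards) // save_rate * save_rate
smoothed = np.asarray(episode_rewards[:n]).reshape(-1, save_rate).mean(axis=1)
print(smoothed)
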
Example #30
0
def train(arglist):
    with U.single_threaded_session():
        # create world
        world = World()

        # Create environment
        env = MultiAgentTorcsEnv(world,
                                 0,
                                 world.reset_world,
                                 world.reward,
                                 world.observation,
                                 done_callback=world.done)

        # Create agent trainers
        obs_shape_n = [env.observation_space[i].shape for i in range(env.n)]
        num_adversaries = env.adv  #min(env.n, arglist.num_adversaries)
        trainers = get_trainers(env, num_adversaries, obs_shape_n, arglist)

        print('Using good policy {} and adv policy {}'.format(
            arglist.good_policy, arglist.adv_policy))

        # Initialize
        U.initialize()

        # Load previous results, if necessary
        if arglist.load_dir == "":
            arglist.load_dir = arglist.save_dir
        if arglist.display or arglist.restore or arglist.benchmark:
            print('Loading previous state...')
            U.load_state(arglist.load_dir)

        episode_rewards = [0.0]  # sum of rewards for all agents
        agent_rewards = [[0.0]
                         for _ in range(env.n)]  # individual agent reward
        final_ep_rewards = []  # sum of rewards for training curve
        final_ep_ag_rewards = []  # agent rewards for training curve
        agent_info = [[[]]]  # placeholder for benchmarking info
        saver = tf.train.Saver()

        # TODO: call reset function here
        os.system("pkill torcs")
        os.system("cd ~/vtorcs3 && ./torcs &")  # use the location of the TORCS installation on your system
        time.sleep(0.5)
        os.system('sh autostart.sh')
        time.sleep(1)

        obs_n = []
        world.initialize_agents()
        for agent in env.agents:
            obs_n.append(world.observation(agent))
        #obs_n = env.reset()

        episode_step = 0
        train_step = 0
        t_start = time.time()
        episode_count = 0
        epsilon = 1
        EXPLORE = 100000.
        train_indicator = 1
        print('Starting iterations...')
        while True:
            print("Episode number: " + str(episode_count) + " ")
            # get action
            action_n = [
                agent.action(obs) for agent, obs in zip(trainers, obs_n)
            ]
            # environment step
            new_obs_n, rew_n, done_n, info_n = env.step(
                action_n, epsilon, train_indicator)
            episode_step += 1

            done = all(done_n)
            terminal = (episode_step >= arglist.max_episode_len)
            # collect experience
            for i, agent in enumerate(trainers):
                agent.experience(obs_n[i], action_n[i], rew_n[i], new_obs_n[i],
                                 done_n[i], terminal)
            obs_n = new_obs_n

            for i, rew in enumerate(rew_n):
                episode_rewards[-1] += rew
                agent_rewards[i][-1] += rew

            if done or terminal:
                obs_n = env.reset()
                epsilon -= 1.0 / EXPLORE
                episode_step = 0
                episode_rewards.append(0)
                episode_count += 1
                for a in agent_rewards:
                    a.append(0)
                agent_info.append([[]])

            # increment global step counter
            train_step += 1
            world.step = train_step
            # for benchmarking learned policies
            if arglist.benchmark:
                for i, info in enumerate(info_n):
                    agent_info[-1][i].append(info_n['n'])
                if train_step > arglist.benchmark_iters and (done or terminal):
                    file_name = arglist.benchmark_dir + arglist.exp_name + '.pkl'
                    print('Finished benchmarking, now saving...')
                    with open(file_name, 'wb') as fp:
                        pickle.dump(agent_info[:-1], fp)
                    break
                continue

            #NA for TORCS env
            # for displaying learned policies
            '''if arglist.display:
                time.sleep(0.1)
                env.render()
                continue'''

            # update all trainers, if not in display or benchmark mode
            loss = None
            for agent in trainers:
                agent.preupdate()
            for agent in trainers:
                loss = agent.update(trainers, train_step)
            l2 = "Loss is " + str(loss) + "\n"
            with open("log2.txt", "a") as f:
                f.write(l2)
            print(l2)

            # save model, display training output
            if terminal and (len(episode_rewards) % arglist.save_rate == 0):
                U.save_state(arglist.save_dir, saver=saver)
                # print statement depends on whether or not there are adversaries
                if num_adversaries == 0:
                    print(
                        "steps: {}, episodes: {}, mean episode reward: {}, time: {}"
                        .format(train_step, len(episode_rewards),
                                np.mean(episode_rewards[-arglist.save_rate:]),
                                round(time.time() - t_start, 3)))
                else:
                    print(
                        "steps: {}, episodes: {}, mean episode reward: {}, agent episode reward: {}, time: {}"
                        .format(train_step, len(episode_rewards),
                                np.mean(episode_rewards[-arglist.save_rate:]),
                                [
                                    np.mean(rew[-arglist.save_rate:])
                                    for rew in agent_rewards
                                ], round(time.time() - t_start, 3)))
                t_start = time.time()
                # Keep track of final episode reward
                final_ep_rewards.append(
                    np.mean(episode_rewards[-arglist.save_rate:]))
                for rew in agent_rewards:
                    final_ep_ag_rewards.append(
                        np.mean(rew[-arglist.save_rate:]))

            # saves final episode reward for plotting training curve later
            if len(episode_rewards) > arglist.num_episodes:
                rew_file_name = arglist.plots_dir + arglist.exp_name + '_rewards.pkl'
                with open(rew_file_name, 'wb') as fp:
                    pickle.dump(final_ep_rewards, fp)
                agrew_file_name = arglist.plots_dir + arglist.exp_name + '_agrewards.pkl'
                with open(agrew_file_name, 'wb') as fp:
                    pickle.dump(final_ep_ag_rewards, fp)
                print('...Finished total of {} episodes.'.format(
                    len(episode_rewards)))
                break
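
The TORCS restart sequence near the top of this example (kill any running instance, relaunch the simulator, then run autostart.sh) is the part most likely to need local adaptation. A minimal sketch that wraps those same os.system calls in a helper; the directory and script names are taken from the example and are assumptions about the local installation:

import os
import time

def restart_torcs(torcs_dir='~/vtorcs3', boot_delay=0.5, autostart_delay=1.0):
    # Mirrors the inline restart code in the training loop above.
    os.system("pkill torcs")
    os.system("cd {} && ./torcs &".format(torcs_dir))  # adjust to your TORCS install
    time.sleep(boot_delay)
    os.system("sh autostart.sh")  # assumed to live in the working directory
    time.sleep(autostart_delay)
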