Example #1
def play(episodes, is_render, is_testing, checkpoint_interval,
         weights_filename_prefix, csv_filename_prefix, batch_size):
    # init statistics. NOTE: simple tag specific!
    statistics_header = ["episode"]
    statistics_header.append("steps")
    statistics_header.extend(["reward_{}".format(i) for i in range(env.n)])
    statistics_header.extend(["loss_{}".format(i) for i in range(env.n)])
    statistics_header.extend(["eps_greedy_{}".format(i) for i in range(env.n)])
    statistics_header.extend(["collisions_{}".format(i) for i in range(env.n)])
    print("Collecting statistics {}:".format(" ".join(statistics_header)))
    statistics = general_utilities.Time_Series_Statistics_Store(
        statistics_header)

    for episode in range(episodes):
        states = env.reset()
        episode_losses = np.zeros(env.n)
        episode_rewards = np.zeros(env.n)
        collision_count = np.zeros(env.n)
        steps = 0

        while True:
            steps += 1

            # render
            if is_render:
                env.render()
                time.sleep(0.1)

            # act
            actions = []
            actions_onehot = []
            for i in range(env.n):
                action = dqns[i].choose_action(states[i])
                speed = 0.9 if env.agents[i].adversary else 1

                onehot_action = np.zeros(n_actions[i])
                onehot_action[action] = speed
                actions_onehot.append(onehot_action)
                actions.append(action)

            # step
            states_next, rewards, done, info = env.step(actions_onehot)

            # learn
            if not is_testing:
                size = memories[0].pointer
                # draw one set of random indices shared by every agent's replay memory,
                # so all agents learn from time-aligned transitions
                batch = random.sample(range(size), min(size, batch_size))

                for i in range(env.n):
                    if done[i]:
                        rewards[i] -= 50

                    memories[i].remember(states[i], actions[i], rewards[i],
                                         states_next[i], done[i])
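                    # learning starts only once the buffer holds at least 10 full batches;
                    # until then the loss column carries the sentinel value -1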

                    if memories[i].pointer > batch_size * 10:
                        history = dqns[i].learn(*memories[i].sample(batch))
                        episode_losses[i] += history.history["loss"][0]
                    else:
                        episode_losses[i] = -1

            states = states_next
            episode_rewards += rewards
            collision_count += np.array(
                simple_tag_utilities.count_agent_collisions(env))

            # reset states if done
            if any(done):
                episode_rewards = episode_rewards / steps
                episode_losses = episode_losses / steps

                statistic = [episode]
                statistic.append(steps)
                statistic.extend([episode_rewards[i] for i in range(env.n)])
                statistic.extend([episode_losses[i] for i in range(env.n)])
                statistic.extend([dqns[i].eps_greedy for i in range(env.n)])
                statistic.extend(collision_count.tolist())
                statistics.add_statistics(statistic)
                if episode % 25 == 0:
                    print(statistics.summarize_last())
                break

        if episode % checkpoint_interval == 0:
            statistics.dump("{}_{}.csv".format(csv_filename_prefix, episode))
            general_utilities.save_dqn_weights(
                dqns, "{}_{}_".format(weights_filename_prefix, episode))
            if episode >= checkpoint_interval:
                os.remove("{}_{}.csv".format(csv_filename_prefix,
                                             episode - checkpoint_interval))

    return statistics
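These examples lean on a Memory replay buffer exposing remember, sample, and a pointer counter, together with per-agent DQN objects. Below is a minimal sketch of that buffer interface, assuming sample takes the list of index positions drawn in play(); the class actually used by these snippets may store and sample transitions differently.

import numpy as np


class Memory:
    # Fixed-capacity replay buffer; pointer counts the transitions currently stored.
    def __init__(self, capacity):
        self.capacity = capacity
        self.data = []
        self.pointer = 0

    def remember(self, state, action, reward, state_next, done):
        # drop the oldest transition once the buffer is full
        if len(self.data) >= self.capacity:
            self.data.pop(0)
        self.data.append((state, action, reward, state_next, done))
        self.pointer = len(self.data)

    def sample(self, indices):
        # indices is the shared list of random positions drawn in play(),
        # so every agent samples time-aligned transitions
        batch = [self.data[i] for i in indices]
        states, actions, rewards, states_next, dones = map(np.array, zip(*batch))
        return states, actions, rewards, states_next, dones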
Example #2
def play(episodes, is_render, is_testing, checkpoint_interval,
         weights_filename_prefix, csv_filename_prefix, batch_size):
    # init statistics. NOTE: specific to this environment!
    statistics_header = ["episode"]
    statistics_header.append("steps")
    statistics_header.append("done")
    statistics_header.append("reward")
    statistics_header.extend(
        ["loss_{}".format(i) for i in range(env.num_agents)])
    statistics_header.extend(
        ["eps_greedy_{}".format(i) for i in range(env.num_agents)])
    statistics_header.extend(
        ["Agent Energy Left_{}".format(i) for i in range(env.num_agents)])
    statistics_header.extend(
        ["Task Energy Left_{}".format(i) for i in range(env.num_agents)])
    print("Collecting statistics {}:".format(" ".join(statistics_header)))
    statistics = general_utilities.Time_Series_Statistics_Store(
        statistics_header)

    for episode in range(episodes):
        states = env.reset()
        # episode_losses = np.zeros(env.n)
        # episode_rewards = np.zeros(env.n)
        # collision_count = np.zeros(env.n)
        episode_losses = np.zeros(env.num_agents)
        episode_rewards = 0

        steps = 0

        all_states = [states]
        while steps <= 600:
            steps += 1

            # render
            # if args.render:
            #     env._render()

            # act
            actions = []
            # each agent's DQN selects its action from the shared joint state
            for i in range(env.num_agents):
                action = dqns[i].choose_action(states)
                actions.append(action)

            # step
            states_next, rewards, done, info = env.step(actions)
            all_states.append(states_next)
            # learn
            if not is_testing:
                size = memories[0].pointer
                # draw one set of random indices shared by every agent's replay memory
                batch = random.sample(range(size), min(size, batch_size))

                for i in range(env.num_agents):
                    memories[i].remember(states, actions[i], rewards,
                                         states_next, done)

                    if memories[i].pointer > batch_size * 10:
                        history = dqns[i].learn(*memories[i].sample(batch))
                        episode_losses[i] += history.history["loss"][0]
                    else:
                        episode_losses[i] = -1

            states = states_next
            episode_rewards += rewards
            # reset states if done
            if done or steps >= 600:
                episode_losses = episode_losses / steps

                statistic = [episode]
                statistic.append(steps)
                statistic.append(done)
                statistic.append(episode_rewards)
                statistic.extend(
                    [episode_losses[i] for i in range(env.num_agents)])
                statistic.extend(
                    [dqns[i].eps_greedy for i in range(env.num_agents)])
                statistic.extend([env.B_k[i] for i in range(env.num_agents)])
                statistic.extend([env.T_i[i] for i in range(env.num_agents)])
                statistics.add_statistics(statistic)
                print(statistics.summarize_last())

                if done:
                    # dump the joint-state trajectory of the finished episode to disk
                    with open('/save/states/episode{}_states.txt'.format(episode),
                              mode='w') as myfile:
                        for each in all_states:
                            myfile.write(str(each))
                            myfile.write('\n')
                break

        if episode % checkpoint_interval == 0:
            statistics.dump("{}_{}.csv".format(csv_filename_prefix, episode))
            general_utilities.save_dqn_weights(
                dqns, "{}_{}_".format(weights_filename_prefix, episode))
            if episode >= checkpoint_interval:
                os.remove("{}_{}.csv".format(csv_filename_prefix,
                                             episode - checkpoint_interval))

    return statistics
Example #3
    tf.set_random_seed(args.random_seed)

    # init DQNs
    n_actions = [env.action_space[i].n for i in range(env.n)]
    state_sizes = [env.observation_space[i].shape[0] for i in range(env.n)]
    memories = [Memory(args.memory_size) for i in range(env.n)]
    dqns = [
        DQN(n_actions[i], state_sizes[i], eps_greedy=epsilon_greedy[i])
        for i in range(env.n)
    ]

    general_utilities.load_dqn_weights_if_exist(
        dqns, args.experiment_prefix + args.weights_filename_prefix)

    start_time = time.time()

    # play
    statistics = play(args.episodes, args.render, args.testing,
                      args.checkpoint_frequency,
                      args.experiment_prefix + args.weights_filename_prefix,
                      args.experiment_prefix + args.csv_filename_prefix,
                      args.batch_size)

    # bookkeeping
    print("Finished {} episodes in {} seconds".format(args.episodes,
                                                      time.time() -
                                                      start_time))
    general_utilities.save_dqn_weights(
        dqns, args.experiment_prefix + args.weights_filename_prefix)
    statistics.dump(args.experiment_prefix + args.csv_filename_prefix + ".csv")
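The snippet above reads every hyper-parameter from an args namespace. A minimal sketch of the corresponding argument parser follows; the flag names mirror the attributes used throughout these examples, while the default values are placeholders rather than the settings of the original experiments.

import argparse

parser = argparse.ArgumentParser()
parser.add_argument("--episodes", type=int, default=10000)
parser.add_argument("--render", action="store_true")
parser.add_argument("--testing", action="store_true")
parser.add_argument("--checkpoint_frequency", type=int, default=500)
parser.add_argument("--batch_size", type=int, default=128)
parser.add_argument("--memory_size", type=int, default=10000)
parser.add_argument("--random_seed", type=int, default=2)
parser.add_argument("--experiment_prefix", default="./experiments/")
parser.add_argument("--weights_filename_prefix", default="dqn_weights")
parser.add_argument("--csv_filename_prefix", default="statistics")
args = parser.parse_args()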
Example #4
def play(episodes, is_render, is_testing, checkpoint_interval,
         weights_filename_prefix, csv_filename_prefix, batch_size):
    # init statistics. NOTE: simple tag specific!
    statistics_header = ["episode"]
    statistics_header.append("steps")
    statistics_header.extend(["reward_{}".format(i) for i in range(env.n)])
    statistics_header.extend(["loss_{}".format(i) for i in range(env.n)])
    statistics_header.extend(["eps_greedy_{}".format(i) for i in range(env.n)])
    statistics_header.extend(["collisions_{}".format(i) for i in range(env.n)])
    print("Collecting statistics {}:".format(" ".join(statistics_header)))
    statistics = general_utilities.Time_Series_Statistics_Store(
        statistics_header)

    for episode in range(episodes):
        states = env.reset()
        episode_losses = np.zeros(env.n)
        episode_rewards = np.zeros(env.n)
        collision_count = np.zeros(env.n)
        steps = 0

        while True:
            steps += 1

            # render
            if is_render:
                env.render()

            # act
            actions = []
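            # the first h agents act with per-agent DQNs; the remaining agents act with
            # continuous DDPG actors plus clipped exploration noise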
            for i in range(env.n):
                if i < h:
                    action = dqns[i].choose_action(states[i])
                    speed = 0.9 if env.agents[i].adversary else 1

                    onehot_action = np.zeros(n_actions[i])
                    onehot_action[action] = speed
                    actions.append(onehot_action)
                else:
                    action = np.clip(
                        actors[i].choose_action(states[i]) + actors_noise[i](),
                        -2, 2)
                    actions.append(action)

            # step
            states_next, rewards, done, info = env.step(actions)

            # learn
            if not is_testing:
                size = memories[0].pointer
                # draw one set of random indices shared by every agent's replay memory
                batch = random.sample(range(size), min(size, batch_size))

                for i in range(env.n):
                    if done[i]:
                        rewards[i] -= 50
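                    # DQN agents (i < h) store per-agent observations and a discrete action index;
                    # DDPG agents store the joint state and joint action for their critics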

                    if i < h:
                        memories[i].remember(states[i], np.argmax(actions[i]),
                                             rewards[i], states_next[i],
                                             done[i])
                    else:
                        memories[i].remember(states, actions, rewards[i],
                                             states_next, done[i])

                    if i < h:
                        if memories[i].pointer > batch_size * 10:
                            # unpack into a local name so the episode's done flags are not clobbered
                            s, a, r, sn, d = memories[i].sample(batch)
                            history = dqns[i].learn(s, a, r, sn, d)
                            episode_losses[i] += history.history["loss"][0]
                        else:
                            episode_losses[i] = -1
                    else:
                        if memories[i].pointer > batch_size * 10:
                            s, a, r, sn, _ = memories[i].sample(batch, env.n)
                            r = np.reshape(r, (batch_size, 1))
                            loss = critics[i].learn(s, a, r, sn)
                            actors[i].learn(actors, s)
                            episode_losses[i] += loss
                        else:
                            episode_losses[i] = -1

            states = states_next
            episode_rewards += rewards
            collision_count += np.array(
                simple_tag_utilities.count_agent_collisions(env))

            # reset states if done
            if any(done):
                episode_rewards = episode_rewards / steps
                episode_losses = episode_losses / steps

                statistic = [episode]
                statistic.append(steps)
                statistic.extend([episode_rewards[i] for i in range(env.n)])
                statistic.extend([episode_losses[i] for i in range(env.n)])
                # DQN agents report eps_greedy; the DDPG agents have none, so pad their columns with -1
                statistic.extend([
                    *[dqns[i].eps_greedy for i in range(h)],
                    *[-1 for _ in range(h, env.n)]
                ])
                statistic.extend(collision_count.tolist())
                statistics.add_statistics(statistic)
                if episode % 25 == 0:
                    print(statistics.summarize_last())
                break

        if episode % checkpoint_interval == 0:
            statistics.dump("{}_{}.csv".format(csv_filename_prefix, episode))
            # save both the DQN agents' weights and the TF session holding the DDPG actors/critics
            general_utilities.save_dqn_weights(
                dqns, "{}_{}_".format(weights_filename_prefix, episode))
            if not os.path.exists(weights_filename_prefix):
                os.makedirs(weights_filename_prefix)
            save_path = saver.save(session,
                                   os.path.join(weights_filename_prefix,
                                                "models"),
                                   global_step=episode)
            print("saving model to {}".format(save_path))
            if episode >= checkpoint_interval:
                os.remove("{}_{}.csv".format(csv_filename_prefix,
                                             episode - checkpoint_interval))

    return statistics