Example No. 1
import time

import matplotlib.pyplot as plt

# The remaining names (statistics, board, Environment, DQNAgent, random_choice)
# are project-local modules/objects and are assumed to be importable from the
# surrounding package.


def train(model_path='models/model.h5',
          opponent_policy=random_choice,
          num_episodes=1000,
          agent_params={},
          **kwargs):
    # Running statistics and the live plot they feed.
    stats = statistics.default_stats()
    plt_data = statistics.plot_stats(stats, data=None)

    agent = DQNAgent(**agent_params)

    for episode in range(num_episodes):
        print('Episode {}/{}'.format(episode, num_episodes))
        # Fresh environment each episode; the agent always plays RED and moves first.
        env = Environment(opponent_policy=opponent_policy,
                          agent_color=board.RED,
                          agent_first_turn=True)
        done = False
        episode_length = 0
        while not done:
            # Epsilon-greedy action, one environment step, then store the
            # transition and train on a replay minibatch.
            state = env.get_state()
            action = agent.act_epsilon_greedy(state)
            next_state, reward, event = env.step(action)
            done = event != board.EVENT_IN_GAME
            agent.remember(state, action, reward, next_state, done)
            agent.replay(stats=stats)

            if event == board.EVENT_WIN:
                print('Won Game!')

            episode_length += 1

        # Record the final event (win/loss/draw) and the episode length.
        stats['episode_results'].append(event)
        stats['episode_lengths'].append(episode_length)

        # Refresh the live plot without blocking the training loop.
        plt_data = statistics.plot_stats(stats, data=plt_data)
        plt.pause(0.0001)

        # Periodic checkpoint every 100 episodes.
        if episode % 100 == 0:
            agent.save(model_path)

    # Final checkpoint, stats dump and plot.
    agent.save(model_path)
    saved_params = {'agent_params': agent_params, 'num_episodes': num_episodes}
    statistics.save_stats(
        stats, saved_params,
        "stats/stats-{}.json".format(time.strftime("%Y%m%d-%H%M%S")))
    statistics.plot_stats(stats, data=plt_data)
    plt.show()
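A minimal usage sketch for the function above, assuming this module is run directly and the models/ and stats/ directories already exist; any keyword arguments accepted by the DQNAgent constructor could be forwarded through agent_params:

if __name__ == '__main__':
    # Train for 5000 episodes against the default random opponent and
    # checkpoint to the default models/model.h5 path.
    train(num_episodes=5000)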
Example No. 2
    # Fragment of a DQN training loop. It assumes the enclosing function has
    # already built env, agent, memory, params, args and eps_schedule, and that
    # numpy (np), tqdm, collections.deque and VisdomLinePlotter are imported.
    episode_reward = 0.0
    episode_count = 0

    obs = env.reset()
    # Progress bar over the warm-up steps (negative indices) plus the main
    # training steps.
    pb = tqdm(range(-params.memory_initial, params.max_steps))
    plotter = VisdomLinePlotter()
    reward_history = deque(maxlen=100)
    for i in pb:
        if args.render:
            env.render()

        # Store the raw observation and build the frame-stacked state for the agent.
        idx = memory.store_obs(obs)
        state = memory.get_stacked_obs(idx)

        # Epsilon-greedy action with the annealed epsilon for step i.
        action = agent.act_epsilon_greedy(state, eps_schedule.get(i))
        next_obs, reward, done, _ = env.step(action)
        episode_reward += reward
        # Clip the reward to {-1, 0, +1} before storing the transition.
        memory.store_effect(idx, action, np.sign(reward), done)

        if done:
            next_obs = env.reset()
            episode_count += 1
            reward_history.append(episode_reward)

            pb.set_description(
                f"episode: {episode_count}, reward: {episode_reward}, eps: {eps_schedule.get(i)*100:.2f}%"
            )
            # Log the episode return and its running average to Visdom.
            plotter.plot('episode reward', 'episode return', "Episode Return",
                         episode_count, episode_reward)
            plotter.plot('episode reward', 'average return', "Episode Return",