Example #1
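# Tail of the DQNAgent(...) constructor; delta_clip=1. caps the Huber-loss error term used by keras-rl.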
               delta_clip=1.)
dqn.compile(Adam(lr=.00025), metrics=['mae'])

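# Periodic weight checkpoints and a JSON training log for the run.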
# dqn.fit(enviro, callbacks=None, nb_steps=1750000, log_interval=10000)
weights_filename = 'dqn_{}_weights.h5f'.format('PSF')
checkpoint_weights_filename = 'dqn_' + 'PSF' + '_weights_{step}.h5f'
log_filename = 'dqn_{}_log.json'.format('PSF')
callbacks = [
    ModelIntervalCheckpoint(checkpoint_weights_filename, interval=250000)
]
callbacks += [FileLogger(log_filename, interval=100)]

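# Train the agent; the callbacks above handle checkpointing and logging.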
dqn.fit(enviro,
        callbacks=callbacks,
        verbose=2,
        nb_steps=N_steps,
        action_repetition=1,
        log_interval=1000,
        nb_max_episode_steps=episode_len)

# dqn.test(enviro, nb_episodes=1, visualize=False)

# new_state = Z * np.random.uniform(-1., 1., size=N_zern)
# # new_state = np.array([1, 0.5])
# enviro.x0 = new_state.copy()
# _obs = enviro.reset()
# dqn.test(enviro, nb_episodes=1, nb_max_start_steps=0, visualize=False)
#
# # Try with lower gamma, more immediate reward
#
# # Check what's going on
Example #2
from copy import deepcopy

import numpy as np
import matplotlib.pyplot as plt
from kaggle_environments import make

# Project-specific helpers used below (not defined in this snippet):
# DQNAgent, ReplayBuffer, EpsilonGreedyStrategy, preprocess_state,
# get_direction, calculate_reward


def dqn_selfplay(model_name,
                 load_model=False,
                 model_filename=None,
                 optimizer_filename=None):
    print("DQN -- Self-play training")

    env = make('hungry_geese')
    trainer = env.train(
        ['greedy', None, 'agents/boilergoose.py', 'agents/handy_rl.py'])

    agent = DQNAgent(rows=11, columns=11, num_actions=3)
    buffer = ReplayBuffer()
    strategy = EpsilonGreedyStrategy(start=0.5, end=0.0, decay=0.00001)

    if load_model:
        agent.load_model_weights(model_filename)
        agent.load_optimizer_weights(optimizer_filename)

    start_episode = 0
    end_episode = 50000
    epochs = 32
    batch_size = 128

    training_rewards = []
    evaluation_rewards = []
    last_1000_ep_reward = []

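    # Self-play opponents: frozen copies of the current agent (pool refreshed every 5000 episodes below).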
    enemies = [deepcopy(agent), deepcopy(agent), deepcopy(agent)]

    for episode in range(start_episode + 1, end_episode + 1):
        obs_dict = env.reset(4)
        obs_dict = obs_dict[0].observation
        epsilon = strategy.get_epsilon(episode - start_episode)
        ep_reward, ep_steps, done = 0, 0, False
        prev_direction = 0
        enemies_prev_direction = [0, 0, 0]

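        # Play one full episode: the learner controls goose 0, the frozen copies control geese 1-3.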
        while not done:
            ep_steps += 1

            state = preprocess_state(obs_dict, prev_direction)
            action = agent.select_epsilon_greedy_action(state, epsilon)
            direction = get_direction(prev_direction, action)

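            # Pick each opponent's move from its own view of the board (player indices 1-3).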
            enemies_obs_dict = deepcopy(obs_dict)
            enemies_direction = []
            for index, enemy, enemy_prev_direction in zip(
                    range(3), enemies, enemies_prev_direction):
                enemies_obs_dict['index'] = index + 1
                enemy_state = preprocess_state(enemies_obs_dict,
                                               enemy_prev_direction)
                enemy_action = enemy.select_action(enemy_state)
                enemy_direction = get_direction(enemy_prev_direction,
                                                enemy_action)
                enemies_direction.append(enemy_direction)

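            # Step the four-player environment; the learner's reward is recomputed via calculate_reward.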
            step = env.step([
                env.specification.action.enum[direction],
                env.specification.action.enum[enemies_direction[0]],
                env.specification.action.enum[enemies_direction[1]],
                env.specification.action.enum[enemies_direction[2]]
            ])
            next_obs_dict, _, done = step[0].observation, (
                step[0].reward - ep_reward), step[0].status == 'DONE'
            reward = calculate_reward(obs_dict, next_obs_dict)
            next_state = preprocess_state(next_obs_dict, direction)
            buffer.add(state, action, reward, next_state, done)

            obs_dict = next_obs_dict
            prev_direction = direction
            enemies_prev_direction = enemies_direction

            ep_reward += reward

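        # After each episode, run several optimisation epochs on mini-batches sampled from the replay buffer.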
        if len(buffer) >= batch_size:
            for _ in range(epochs):
                states, actions, rewards, next_states, dones = buffer.get_samples(
                    batch_size)
                agent.fit(states, actions, rewards, next_states, dones)

        print("EPISODE " + str(episode) + " - REWARD: " + str(ep_reward) +
              " - STEPS: " + str(ep_steps))

        if len(last_1000_ep_reward) == 1000:
            last_1000_ep_reward = last_1000_ep_reward[1:]
        last_1000_ep_reward.append(ep_reward)

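        # Sync the target network used for bootstrapped Q-targets every 10 episodes.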
        if episode % 10 == 0:
            agent.update_target_network()

        if episode % 1000 == 0:
            print('Episode ' + str(episode) + '/' + str(end_episode))
            print('Epsilon: ' + str(round(epsilon, 3)))

            last_1000_ep_reward_mean = np.mean(last_1000_ep_reward).round(3)
            training_rewards.append(last_1000_ep_reward_mean)
            print('Average reward in last 1000 episodes: ' +
                  str(last_1000_ep_reward_mean))
            print()

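        # Greedy evaluation (epsilon = 0) against the fixed trainer opponents, averaged over 100 games.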
        if episode % 1000 == 0:
            eval_reward = 0
            for i in range(100):
                obs_dict = trainer.reset()
                epsilon = 0
                done = False
                prev_direction = 0
                while not done:
                    state = preprocess_state(obs_dict, prev_direction)
                    action = agent.select_epsilon_greedy_action(state, epsilon)
                    direction = get_direction(prev_direction, action)
                    next_obs_dict, _, done, _ = trainer.step(
                        env.specification.action.enum[direction])
                    reward = calculate_reward(obs_dict, next_obs_dict)
                    obs_dict = next_obs_dict
                    prev_direction = direction
                    eval_reward += reward
            eval_reward /= 100
            evaluation_rewards.append(eval_reward)
            print("Evaluation reward: " + str(eval_reward))
            print()

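        # Checkpoint model weights and optimizer state every 5000 episodes.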
        if episode % 5000 == 0:
            agent.save_model_weights('models/self-play_dqn_' + model_name +
                                     '_' + str(episode) + '.h5')
            agent.save_optimizer_weights('models/self-play_dqn_' + model_name +
                                         '_' + str(episode) + '_optimizer.npy')

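        # Replace the oldest opponent with a fresh copy of the current agent.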
        if episode % 5000 == 0:
            enemies = enemies[1:]
            enemies.append(deepcopy(agent))

    agent.save_model_weights('models/self-play_dqn_' + model_name + '_' +
                             str(end_episode) + '.h5')
    agent.save_optimizer_weights('models/self-play_dqn_' + model_name + '_' +
                                 str(end_episode) + '_optimizer.npy')

    plt.plot([i for i in range(start_episode + 1000, end_episode + 1, 1000)],
             training_rewards)
    plt.title('Reward')
    plt.show()

    plt.plot([i for i in range(start_episode + 1000, end_episode + 1, 1000)],
             evaluation_rewards)
    plt.title('Evaluation rewards')
    plt.show()
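
A minimal invocation sketch; the model name 'geese_v1' is a placeholder, and a models/ directory is assumed to exist for the periodic checkpoints.

if __name__ == '__main__':
    # Train from scratch under a hypothetical name; pass load_model=True
    # with weight/optimizer filenames to resume a previous run.
    dqn_selfplay('geese_v1')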