               delta_clip=1.)  # tail of the DQNAgent(...) construction above
dqn.compile(Adam(lr=.00025), metrics=['mae'])

# dqn.fit(enviro, callbacks=None, nb_steps=1750000, log_interval=10000)

weights_filename = 'dqn_{}_weights.h5f'.format('PSF')
checkpoint_weights_filename = 'dqn_' + 'PSF' + '_weights_{step}.h5f'
log_filename = 'dqn_{}_log.json'.format('PSF')
callbacks = [ModelIntervalCheckpoint(checkpoint_weights_filename, interval=250000)]
callbacks += [FileLogger(log_filename, interval=100)]

# Pass the callbacks built above (previously callbacks=None, which silently
# dropped the checkpointing and logging).
dqn.fit(enviro, callbacks=callbacks, verbose=2, nb_steps=N_steps,
        action_repetition=1, log_interval=1000,
        nb_max_episode_steps=episode_len)

# dqn.test(enviro, nb_episodes=1, visualize=False)
# new_state = Z * np.random.uniform(-1., 1., size=N_zern)
# # new_state = np.array([1, 0.5])
# enviro.x0 = new_state.copy()
# _obs = enviro.reset()
# dqn.test(enviro, nb_episodes=1, nb_max_start_steps=0, visualize=False)
#
# # Try with lower gamma, more immediate reward
# # Check what's going on
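# A follow-up sketch (not in the original script): persist the final weights
# after training and reload them for a greedy test run. save_weights,
# load_weights and test are standard keras-rl DQNAgent methods;
# 'final_weights_filename' is a hypothetical name chosen here, and the number
# of test episodes is arbitrary.
# final_weights_filename = 'dqn_PSF_weights_final.h5f'
# dqn.save_weights(final_weights_filename, overwrite=True)
# dqn.load_weights(final_weights_filename)
# dqn.test(enviro, nb_episodes=5, visualize=False)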
# Imports needed by this function; the project-specific helpers (DQNAgent,
# ReplayBuffer, EpsilonGreedyStrategy, preprocess_state, get_direction,
# calculate_reward) are assumed to be defined elsewhere in the repo.
from copy import deepcopy

import numpy as np
import matplotlib.pyplot as plt
from kaggle_environments import make


def dqn_selfplay(model_name, load_model=False, model_filename=None,
                 optimizer_filename=None):
    print("DQN -- Self-play training")
    env = make('hungry_geese')
    trainer = env.train(
        ['greedy', None, 'agents/boilergoose.py', 'agents/handy_rl.py'])

    agent = DQNAgent(rows=11, columns=11, num_actions=3)
    buffer = ReplayBuffer()
    strategy = EpsilonGreedyStrategy(start=0.5, end=0.0, decay=0.00001)

    if load_model:
        agent.load_model_weights(model_filename)
        agent.load_optimizer_weights(optimizer_filename)

    start_episode = 0
    end_episode = 50000
    epochs = 32
    batch_size = 128

    training_rewards = []
    evaluation_rewards = []
    last_1000_ep_reward = []

    # The agent trains against three frozen copies of itself; the pool is
    # rotated every 5000 episodes (see below).
    enemies = [deepcopy(agent), deepcopy(agent), deepcopy(agent)]

    for episode in range(start_episode + 1, end_episode + 1):
        obs_dict = env.reset(4)
        obs_dict = obs_dict[0].observation
        epsilon = strategy.get_epsilon(episode - start_episode)
        ep_reward, ep_steps, done = 0, 0, False
        prev_direction = 0
        enemies_prev_direction = [0, 0, 0]

        while not done:
            ep_steps += 1

            state = preprocess_state(obs_dict, prev_direction)
            action = agent.select_epsilon_greedy_action(state, epsilon)
            direction = get_direction(prev_direction, action)

            # Each frozen enemy acts greedily on its own view of the board.
            enemies_obs_dict = deepcopy(obs_dict)
            enemies_direction = []
            for index, enemy, enemy_prev_direction in zip(
                    range(3), enemies, enemies_prev_direction):
                enemies_obs_dict['index'] = index + 1
                enemy_state = preprocess_state(enemies_obs_dict,
                                               enemy_prev_direction)
                enemy_action = enemy.select_action(enemy_state)
                enemy_direction = get_direction(enemy_prev_direction,
                                                enemy_action)
                enemies_direction.append(enemy_direction)

            step = env.step([
                env.specification.action.enum[direction],
                env.specification.action.enum[enemies_direction[0]],
                env.specification.action.enum[enemies_direction[1]],
                env.specification.action.enum[enemies_direction[2]]
            ])
            next_obs_dict, _, done = step[0].observation, (
                step[0].reward - ep_reward), step[0].status == 'DONE'
            reward = calculate_reward(obs_dict, next_obs_dict)
            next_state = preprocess_state(next_obs_dict, direction)
            buffer.add(state, action, reward, next_state, done)

            obs_dict = next_obs_dict
            prev_direction = direction
            enemies_prev_direction = enemies_direction
            ep_reward += reward

            if len(buffer) >= batch_size:
                for _ in range(epochs):
                    states, actions, rewards, next_states, dones = (
                        buffer.get_samples(batch_size))
                    agent.fit(states, actions, rewards, next_states, dones)

        print("EPISODE " + str(episode) + " - REWARD: " + str(ep_reward) +
              " - STEPS: " + str(ep_steps))

        # Rolling window of the last 1000 episode rewards.
        if len(last_1000_ep_reward) == 1000:
            last_1000_ep_reward = last_1000_ep_reward[1:]
        last_1000_ep_reward.append(ep_reward)

        if episode % 10 == 0:
            agent.update_target_network()

        if episode % 1000 == 0:
            print('Episode ' + str(episode) + '/' + str(end_episode))
            print('Epsilon: ' + str(round(epsilon, 3)))
            last_1000_ep_reward_mean = np.mean(last_1000_ep_reward).round(3)
            training_rewards.append(last_1000_ep_reward_mean)
            print('Average reward in last 1000 episodes: ' +
                  str(last_1000_ep_reward_mean))
            print()

        if episode % 1000 == 0:
            # Greedy evaluation (epsilon = 0) against the fixed trainer
            # opponents, averaged over 100 episodes.
            eval_reward = 0
            for i in range(100):
                obs_dict = trainer.reset()
                epsilon = 0
                done = False
                prev_direction = 0
                while not done:
                    state = preprocess_state(obs_dict, prev_direction)
                    action = agent.select_epsilon_greedy_action(state, epsilon)
                    direction = get_direction(prev_direction, action)
                    next_obs_dict, _, done, _ = trainer.step(
                        env.specification.action.enum[direction])
                    reward = calculate_reward(obs_dict, next_obs_dict)
                    obs_dict = next_obs_dict
                    prev_direction = direction
                    eval_reward += reward
            eval_reward /= 100
            evaluation_rewards.append(eval_reward)
            print("Evaluation reward: " + str(eval_reward))
            print()

        if episode % 5000 == 0:
            agent.save_model_weights('models/self-play_dqn_' + model_name +
                                     '_' + str(episode) + '.h5')
            agent.save_optimizer_weights('models/self-play_dqn_' + model_name +
                                         '_' + str(episode) + '_optimizer.npy')

        if episode % 5000 == 0:
            # Rotate the enemy pool: drop the oldest frozen copy and add a
            # snapshot of the current agent.
            enemies = enemies[1:]
            enemies.append(deepcopy(agent))

    agent.save_model_weights('models/self-play_dqn_' + model_name + '_' +
                             str(end_episode) + '.h5')
    agent.save_optimizer_weights('models/self-play_dqn_' + model_name + '_' +
                                 str(end_episode) + '_optimizer.npy')

    plt.plot(list(range(start_episode + 1000, end_episode + 1, 1000)),
             training_rewards)
    plt.title('Reward')
    plt.show()
    plt.plot(list(range(start_episode + 1000, end_episode + 1, 1000)),
             evaluation_rewards)
    plt.title('Evaluation rewards')
    plt.show()
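

# Usage sketch (assumed entry point, not in the original; 'self_play_v1' is a
# placeholder run name). The resume example reuses the checkpoint naming
# pattern from the save calls inside dqn_selfplay.
if __name__ == '__main__':
    dqn_selfplay('self_play_v1')
    # To resume from a saved checkpoint instead:
    # dqn_selfplay(
    #     'self_play_v1', load_model=True,
    #     model_filename='models/self-play_dqn_self_play_v1_45000.h5',
    #     optimizer_filename='models/self-play_dqn_self_play_v1_45000_optimizer.npy')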