# --- Monitoring & checkpointing wiring (script-level setup) ---
# Register the summaries the TensorBoard monitor will emit: per-game score,
# training loss, and a histogram per Q-head (4 heads).
tensorboard_monitor.add_scalar_summary('score', 'per_game_summary')
tensorboard_monitor.add_scalar_summary('training_loss', 'training_summary')
for i in range(4):
    tensorboard_monitor.add_histogram_summary('Q%d_training' % i, 'training_summary')
checkpoint_monitor = CheckpointRecorder(dqn_agent.dqn, replay_memory, counter, './checkpoints', sess)
agi.add_listener(checkpoint_monitor)
agi.add_listener(tensorboard_monitor)
dqn_agent.add_listener(tensorboard_monitor)
sess.run(tf.global_variables_initializer())

# Load the DQN and replay memory saved at frame 7,000,000, then sync the
# target network so it matches the restored online network.
dqn_agent.dqn.restore('./checkpoints/dqn/7000000')
replay_memory.load('./checkpoints/replay_memory/7000000')
dqn_agent.update_target_network()


def run():
    """Resume training from the restored checkpoint until 50M total frames.

    Plays episodes via ``agi.learn()``, records each episode's score to
    TensorBoard, and prints per-episode progress (score, frames this episode,
    total frames, current exploration epsilon).
    """
    cur_episode = 0
    # Frames already consumed by the restored 7M-frame checkpoint; used to
    # compute per-episode frame deltas below.
    num_frames = 7000000
    while counter.count < 50000000:
        score = agi.learn()
        tensorboard_monitor.record({'score': score})
        elapsed_frames = counter.count - num_frames
        num_frames = counter.count
        # BUG FIX: the original referenced `agent.epsilon`, but no `agent`
        # exists in this scope — the agent set up above is `dqn_agent`.
        # Also converted legacy Python 2 `print` statements to print() calls,
        # matching the rest of the file.
        print("Episode %d: Total Score = %d\t# Frames = %d\tTotal Frames = %d\tEpsilon: %f" %
              (cur_episode, score, elapsed_frames, num_frames, dqn_agent.epsilon))
        cur_episode += 1
        print()
def dqn_selfplay(model_name, load_model=False, model_filename=None, optimizer_filename=None):
    """Train a DQN agent on kaggle 'hungry_geese' via self-play.

    The learning agent plays 4-player games against three frozen copies of
    itself (refreshed every 5000 episodes). Every 1000 episodes it is
    evaluated for 100 games against a fixed opponent lineup; model and
    optimizer weights are checkpointed every 5000 episodes, and training /
    evaluation reward curves are plotted at the end.

    Args:
        model_name: Name fragment used in the saved weight file paths.
        load_model: If True, resume from the given weight files.
        model_filename: Path to model weights to load (when load_model).
        optimizer_filename: Path to optimizer weights to load (when load_model).
    """
    print("DQN -- Self-play training")
    env = make('hungry_geese')
    # Evaluation-only trainer: our agent in a fixed 4-player lineup.
    trainer = env.train(
        ['greedy', None, 'agents/boilergoose.py', 'agents/handy_rl.py'])
    agent = DQNAgent(rows=11, columns=11, num_actions=3)
    buffer = ReplayBuffer()
    # Epsilon decays from 0.5 toward 0.0 over training.
    strategy = EpsilonGreedyStrategy(start=0.5, end=0.0, decay=0.00001)
    if load_model:
        agent.load_model_weights(model_filename)
        agent.load_optimizer_weights(optimizer_filename)
    start_episode = 0
    end_episode = 50000
    epochs = 32          # gradient-update batches per training round
    batch_size = 128
    training_rewards = []      # mean-of-last-1000 reward, sampled every 1000 episodes
    evaluation_rewards = []    # mean eval reward, sampled every 1000 episodes
    last_1000_ep_reward = []   # sliding window of episode rewards
    # Three frozen snapshots of the agent act as self-play opponents.
    enemies = [deepcopy(agent), deepcopy(agent), deepcopy(agent)]
    for episode in range(start_episode + 1, end_episode + 1):
        obs_dict = env.reset(4)
        obs_dict = obs_dict[0].observation   # our goose is player index 0
        epsilon = strategy.get_epsilon(episode - start_episode)
        ep_reward, ep_steps, done = 0, 0, False
        prev_direction = 0
        enemies_prev_direction = [0, 0, 0]
        while not done:
            ep_steps += 1
            state = preprocess_state(obs_dict, prev_direction)
            action = agent.select_epsilon_greedy_action(state, epsilon)
            # Actions are relative; convert to an absolute board direction.
            direction = get_direction(prev_direction, action)
            # Re-key a copy of the observation so each enemy sees the board
            # from its own player index (1..3).
            enemies_obs_dict = deepcopy(obs_dict)
            enemies_direction = []
            for index, enemy, enemy_prev_direction in zip(
                    range(3), enemies, enemies_prev_direction):
                enemies_obs_dict['index'] = index + 1
                enemy_state = preprocess_state(enemies_obs_dict, enemy_prev_direction)
                enemy_action = enemy.select_action(enemy_state)   # greedy (no exploration)
                enemy_direction = get_direction(enemy_prev_direction, enemy_action)
                enemies_direction.append(enemy_direction)
            step = env.step([
                env.specification.action.enum[direction],
                env.specification.action.enum[enemies_direction[0]],
                env.specification.action.enum[enemies_direction[1]],
                env.specification.action.enum[enemies_direction[2]]
            ])
            # NOTE(review): the raw env reward delta is computed but discarded
            # (`_`); learning uses the shaped calculate_reward() instead.
            next_obs_dict, _, done = step[0].observation, (
                step[0].reward - ep_reward), step[0].status == 'DONE'
            reward = calculate_reward(obs_dict, next_obs_dict)
            next_state = preprocess_state(next_obs_dict, direction)
            buffer.add(state, action, reward, next_state, done)
            obs_dict = next_obs_dict
            prev_direction = direction
            enemies_prev_direction = enemies_direction
            ep_reward += reward
            # Train once the buffer can fill a batch: `epochs` sampled
            # batches per round.
            if len(buffer) >= batch_size:
                for _ in range(epochs):
                    states, actions, rewards, next_states, dones = buffer.get_samples(
                        batch_size)
                    agent.fit(states, actions, rewards, next_states, dones)
        print("EPISODE " + str(episode) + " - REWARD: " + str(ep_reward) +
              " - STEPS: " + str(ep_steps))
        # Maintain a sliding window of at most 1000 episode rewards.
        if len(last_1000_ep_reward) == 1000:
            last_1000_ep_reward = last_1000_ep_reward[1:]
        last_1000_ep_reward.append(ep_reward)
        # Sync the target network every 10 episodes.
        if episode % 10 == 0:
            agent.update_target_network()
        # Periodic progress report + training-curve sample.
        if episode % 1000 == 0:
            print('Episode ' + str(episode) + '/' + str(end_episode))
            print('Epsilon: ' + str(round(epsilon, 3)))
            last_1000_ep_reward_mean = np.mean(last_1000_ep_reward).round(3)
            training_rewards.append(last_1000_ep_reward_mean)
            print('Average reward in last 1000 episodes: ' +
                  str(last_1000_ep_reward_mean))
            print()
        # Periodic evaluation: 100 greedy (epsilon=0) games vs the fixed
        # trainer lineup; record the mean shaped reward.
        if episode % 1000 == 0:
            eval_reward = 0
            for i in range(100):
                obs_dict = trainer.reset()
                epsilon = 0
                done = False
                prev_direction = 0
                while not done:
                    state = preprocess_state(obs_dict, prev_direction)
                    action = agent.select_epsilon_greedy_action(state, epsilon)
                    direction = get_direction(prev_direction, action)
                    next_obs_dict, _, done, _ = trainer.step(
                        env.specification.action.enum[direction])
                    reward = calculate_reward(obs_dict, next_obs_dict)
                    obs_dict = next_obs_dict
                    prev_direction = direction
                    eval_reward += reward
            eval_reward /= 100
            evaluation_rewards.append(eval_reward)
            print("Evaluation reward: " + str(eval_reward))
            print()
        # Periodic checkpoint of model + optimizer weights.
        if episode % 5000 == 0:
            agent.save_model_weights('models/self-play_dqn_' + model_name +
                                     '_' + str(episode) + '.h5')
            agent.save_optimizer_weights('models/self-play_dqn_' + model_name +
                                         '_' + str(episode) + '_optimizer.npy')
        # Rotate the opponent pool: drop the oldest snapshot, add the current
        # agent as a new frozen opponent.
        if episode % 5000 == 0:
            enemies = enemies[1:]
            enemies.append(deepcopy(agent))
    # Final checkpoint and learning curves.
    agent.save_model_weights('models/self-play_dqn_' + model_name + '_' +
                             str(end_episode) + '.h5')
    agent.save_optimizer_weights('models/self-play_dqn_' + model_name + '_' +
                                 str(end_episode) + '_optimizer.npy')
    plt.plot([i for i in range(start_episode + 1000, end_episode + 1, 1000)],
             training_rewards)
    plt.title('Reward')
    plt.show()
    plt.plot([i for i in range(start_episode + 1000, end_episode + 1, 1000)],
             evaluation_rewards)
    plt.title('Evaluation rewards')
    plt.show()