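"""Deep Q-network trainer for a 2048-style tile game.

Trains an online Q-network against a periodically synchronized Q-hat target
network (the Double-DQN setup), using an experience-replay buffer that is
persisted to disk between runs.
"""
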
import os
import gc
import time
import itertools
import random
import copy
import shutil
import traceback
from collections import deque
import dill
import tensorflow as tf
import numpy as np
from matplotlib import pyplot as plt
from matplotlib.ticker import MaxNLocator
from utils import Utils, RingBuf
from gameboard import GameBoard, GameStates, GameActions
from gamemodel import GameModel
class GameTrainer:
    def __init__(self, board_size, save_model=True, model_dir=None, debug=True, learning_rate=0.01, max_experience_history=600000):
        self.save_model = save_model
        self.model_dir = model_dir
        self.experience_history_path = os.path.join(self.model_dir, "exp_history.p")
        self.max_experience_history = max_experience_history
        self.debug = debug
        self.learning_rate = learning_rate
        # Online Q-network (trained every step) and Q-hat target network (weights copied over periodically)
        self.q_network = GameModel(board_size, model_name='q_network', model_dir=self.model_dir, learning_rate=learning_rate)
        print(self.q_network.model.summary())
        self.q_hat = GameModel(board_size, model_name='q_hat', model_dir=self.model_dir, learning_rate=learning_rate)
    def calc_reward(self, oldboard, newboard):
        # Rewards are normalized so that the max tile value can be changed without having to retrain
        if newboard.game_state == GameStates.LOSE: return -1
        if newboard.game_state == GameStates.WIN: return 1
        # Subtract the expected score from random new-tile placement: a 2 or a 4 is equally likely, so the mean is 3
        tile_placement_mean_score = np.mean(oldboard.bonus_mask * 3)
        reward = np.clip(2 * (newboard.score - oldboard.score - tile_placement_mean_score) / oldboard.max_tile, -1, 1)
        return reward
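    # Worked example of the shaped reward above, assuming max_tile == 2048 and a
    # mean tile-placement bonus of 3 as noted in the comment: merging two 2-tiles
    # raises the score by 4, so reward = 2 * (4 - 3) / 2048 ≈ 0.001. Larger merges
    # scale proportionally until the reward saturates at the [-1, 1] clip.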
    def exec_action(self, gameboard, gameaction):
        # Apply the action to the board and return the prior board state along with the reward earned
        oldboard = copy.deepcopy(gameboard)
        gameboard.make_move(gameaction)
        return (oldboard, self.calc_reward(oldboard, gameboard))
    def preprocess_state(self, gameboard):
        # Scale tile values into [0, 1] on a log2 scale; empty cells (0) are clipped to 1 so they map to 0
        return np.log2(np.clip(gameboard.board, 1, gameboard.max_tile)) / np.log2(gameboard.max_tile)
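    # For example, with max_tile == 2048 (log2 = 11): an empty cell maps to 0.0,
    # a 2-tile to 1/11 ≈ 0.09, a 64-tile to 6/11 ≈ 0.55, and a 2048-tile to 1.0.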
    def get_action_probabilities(self, gameboard):
        # Flatten the Q-network output into a 1-D array of per-action scores
        return np.ravel(self.q_network(self.preprocess_state(gameboard)))
    def select_action(self, gameboard, epsilon):
        if len(gameboard.action_set) <= 0: return None
        # Epsilon-greedy: return a random action with probability epsilon, otherwise the Q-network's recommendation
        if np.random.rand() < epsilon: return random.choice(list(gameboard.action_set))
        # Return the highest-scoring action, as predicted by the Q-network, that is actually legal on the board
        action_probs = self.get_action_probabilities(gameboard)
        best_actions = [GameActions(i) for i in np.argsort(action_probs)[::-1]]
        return next(a for a in best_actions if a in gameboard.action_set)
    def calculate_y_target(self, newboards, actions_oh, rewards, gamestates, gamma):
        # Choose the best action for each future board state with the online Q-network, then evaluate
        # that action with the Q-hat target network (the Double-DQN approach, which reduces value
        # overestimation and speeds up training)
        newboards_arr = np.array(newboards)
        best_actions = np.argmax(self.q_network(newboards_arr), axis=1)
        q_hat_output = self.q_hat(newboards_arr, Utils.one_hot(best_actions, len(GameActions)))
        # The one-hot mask zeroes out all but the selected action, so summing extracts its Q-value
        # (np.max would wrongly return 0 whenever the selected Q-value is negative)
        q_hat_pred = np.sum(q_hat_output, axis=1)
        # Future rewards are discounted by gamma and only counted for non-terminal states
        q_values = q_hat_pred * np.array([0 if s != GameStates.IN_PROGRESS.value else gamma for s in gamestates])
        total_reward = np.array(rewards) + q_values
        return actions_oh * total_reward.reshape((-1, 1))
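    # In equation form, the Double-DQN target computed above is:
    #     y_i = r_i                                                 if s'_i is terminal
    #     y_i = r_i + gamma * Q_hat(s'_i, argmax_a Q(s'_i, a))      otherwise
    # and y_target is that scalar scattered into the one-hot slot of the action taken.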
    def save_experience_history(self, D):
        if os.path.exists(self.experience_history_path): os.remove(self.experience_history_path)
        saved = False
        f_hist = None
        while not saved:
            try:
                f_hist = open(self.experience_history_path, "wb")
                dill.dump(D, f_hist)
                saved = True
                print("Saved gameplay experience to " + self.experience_history_path)
            except Exception as e:
                traceback.print_exc()
                print(e)
                print("WARNING: failed to save experience replay history. Will try again in 5 seconds...")
                time.sleep(5)
            finally:
                if f_hist is not None: f_hist.close()
        if os.path.getsize(self.experience_history_path) > 0: shutil.copy2(self.experience_history_path, self.experience_history_path.replace('.p', '_BACKUP.p'))
    def restore_experience_history(self):
        if not (os.path.exists(self.experience_history_path) and os.path.getsize(self.experience_history_path) > 0):
            return RingBuf(self.max_experience_history)
        with open(self.experience_history_path, "rb") as f_hist:
            D = dill.load(f_hist)
        if isinstance(D, RingBuf) and len(D) > 0:
            print("Restored gameplay experience from " + self.experience_history_path)
            return D
        return RingBuf(self.max_experience_history)
    def train_model(self, episodes=10, max_tile=2048, max_game_history=500, max_epsilon=1.0, min_epsilon=0.1, mini_batch_size=32, gamma=0.99, update_qhat_weights_steps=10000):
        # Training variables
        D = self.restore_experience_history()  # experience replay queue
        gamehistory = deque(maxlen=max_game_history)  # history of completed games
        epsilon = max_epsilon  # probability of selecting a random action, annealed from max_epsilon to min_epsilon over time
        update_frequency = 4  # number of actions selected between gradient descent updates of the Q-network
        globalstep = 0
        q_hat_group = 1
        f_log = None
        if self.debug:
            f_log = open(os.path.join(self.model_dir, "training_log.csv"), "w")
            f_log.write("Q_hat_group,Avg_Pred,Avg_Target,Avg_Diff,Won_Lost_Games,My_Loss,Actual_Loss\n")
        # Loop over the requested number of games (episodes)
        loss = 0
        for episode in range(episodes):
            # Start a new game
            gameboard = GameBoard(self.q_network.board_size, max_tile=max_tile)
            # Play the game to completion
            stepcount = 0
            while gameboard.game_state == GameStates.IN_PROGRESS:
                stepcount += 1
                globalstep += 1
                # Select an action to perform: a random one with probability epsilon, otherwise the
                # highest-scoring action from the Q-network
                action = self.select_action(gameboard, epsilon)
                oldboard, reward = self.exec_action(gameboard, action)
                # Append the (preprocessed) original board, selected action, reward, new board and game state
                # to the experience replay history; the RingBuf caps its size at max_experience_history by
                # overwriting the oldest entries
                experience = (self.preprocess_state(oldboard), action.value, reward, self.preprocess_state(gameboard), gameboard.game_state.value)
                D.append(experience)
                # Perform a gradient descent step on the Q-network every update_frequency actions,
                # but only once the replay buffer has filled to max_experience_history
                if len(D) >= max(mini_batch_size, self.max_experience_history) and globalstep % update_frequency == 0:
                    # Randomly sample from the experience history and unpack into separate arrays
                    batch = [D[i] for i in np.random.randint(0, len(D), mini_batch_size)]
                    oldboards, actions, rewards, newboards, gamestates = [list(k) for k in zip(*batch)]
                    # One-hot encode the actions for each of these boards, as this forms the basis of the
                    # loss calculation (the network output is masked to the action actually taken)
                    actions_one_hot = Utils.one_hot(actions, len(GameActions))
                    # Compute the target network output using the Q-hat network, actions and rewards for each
                    # sampled history item
                    y_target = self.calculate_y_target(newboards, actions_one_hot, rewards, gamestates, gamma)
                    # Perform a single gradient descent update step on the Q-network
                    X = [np.array(oldboards).reshape((-1, self.q_network.board_size, self.q_network.board_size, 1)), actions_one_hot]
                    write_log_entry = (globalstep % (update_frequency * 10) == 0) and f_log is not None
                    if write_log_entry:
                        y_pred = self.q_network(oldboards, actions_one_hot)
                        avg_pred = np.mean(np.abs(np.sum(y_pred, axis=1)))
                        avg_target = np.mean(np.abs(np.sum(y_target, axis=1)))
                        diff = np.sum(y_target - y_pred, axis=1)
                        avg_diff = np.mean(np.abs(diff))
                        my_loss = np.mean(np.square(diff))
                        won_lost_games = np.sum(np.array(gamestates) < GameStates.IN_PROGRESS.value)
                    curloss = self.q_network.model.train_on_batch(x=X, y=y_target)
                    loss += curloss
                    if write_log_entry:
                        f_log.write("{:d},{:.7f},{:.7f},{:.7f},{:d},{:.7f},{:.7f}\n".format(q_hat_group, avg_pred, avg_target, avg_diff, won_lost_games, my_loss, curloss))
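                    # The masked loss being minimized here is effectively
                    #     L = mean_i (y_i - Q(s_i, a_i))^2
                    # since both y_target and the masked network output are zero outside the
                    # one-hot slot of the action taken.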
                # Every update_qhat_weights_steps steps, copy the network weights from the Q-network to the
                # Q-hat target network (keeping the target fixed between copies is required for convergence)
                if globalstep % update_qhat_weights_steps == 0:
                    def test_weight_update(testboard):
                        # Debug helper: compare the two networks' outputs on the same board
                        board_processed = self.preprocess_state(testboard)
                        q_res = np.ravel(self.q_network(board_processed))
                        q_hat_res = np.ravel(self.q_hat(board_processed))
                        print("\nQ-network result on testboard: {0}".format(q_res))
                        print("Q-hat result on testboard: {0}".format(q_hat_res))
                        print("Difference: {0}\n".format(q_res - q_hat_res))
                    test_weight_update(oldboard)
                    self.q_network.copy_weights_to(self.q_hat)
                    print("Weights copied to Q-hat network")
                    self.q_network.save_to_file()
                    test_weight_update(oldboard)
                    # Decay the learning rate as training progresses
                    lr_new = self.learning_rate / (1 + episode / (episodes / 2))
                    self.q_network.compile(learning_rate=lr_new)
                    print("Q-network learning rate updated to {:.6f}".format(lr_new))
                    q_hat_group += 1
            # Anneal epsilon linearly from max_epsilon down to min_epsilon over the course of training
            epsilon = max(min_epsilon, min_epsilon + ((max_epsilon - min_epsilon) * (episodes - episode) / episodes))
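            # With the defaults (max_epsilon=1.0, min_epsilon=0.1), this schedule gives
            # epsilon = 1.0 at episode 0, ~0.55 halfway through, and ~0.1 on the final episode.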
            # Append metrics for each completed game to the game history list
            # (the deque's maxlen automatically discards the oldest entries)
            gameresult = (gameboard.game_state.value, gameboard.score, stepcount, gameboard.largest_tile_placed)
            gamehistory.append(gameresult)
            print('result: {0}, score: {1}, steps: {2}, max tile: {3}'.format(GameStates(gameresult[0]).name, gameresult[1], gameresult[2], gameresult[3]))
            # Print a progress summary every 20 episodes
            if (episode + 1) % 20 == 0:
                games_to_retrieve = min(len(gamehistory), 100)
                last_x_games = list(itertools.islice(gamehistory, len(gamehistory) - games_to_retrieve, None))
                last_x_results = list(zip(*last_x_games))[0]
                games_won = np.sum([1 if r == GameStates.WIN.value else 0 for r in last_x_results])
                print("\nEpisode {0}/{1}".format(episode + 1, episodes))
                print("History queue {0}/{1}".format(len(D), self.max_experience_history))
                print("Game win % (for last {:d} games): {:.1f}%".format(games_to_retrieve, 100. * (games_won / games_to_retrieve)))
                print("Epsilon = {:.3f}".format(epsilon))
                print("Training loss: {:.5f}\n".format(loss))
                loss = 0
            # Save the experience history to disk periodically
            if self.save_model and (episode + 1) % 2000 == 0:
                self.save_experience_history(D)
                # Output garbage-collection diagnostics
                print("GC.isenabled() = {0}".format(gc.isenabled()))
                print("Garbage:", gc.garbage)
                print("Counts:", gc.get_count())
                print("globals() = ", sorted(list(globals().keys())))
        # Perform one final save of the model weights and experience history for the next run
        if self.save_model:
            self.q_network.save_to_file()
            self.save_experience_history(D)
        if f_log: f_log.close()
        return gamehistory
    @staticmethod
    def display_training_history(gamehistory):
        results, scores, stepcounts, max_tiles = list(zip(*gamehistory))
        # Cumulative win percentage after each game
        gamenumbers = np.arange(1, len(results) + 1)
        resultpercentages = np.cumsum([1 if r == GameStates.WIN.value else 0 for r in results]) / gamenumbers
        print("Final win %: {:.2f}".format(resultpercentages[-1] * 100.))
        fig, ax_arr = plt.subplots(nrows=4, ncols=1, sharex=True, figsize=(5, 10))
        ax_arr[0].plot(gamenumbers, resultpercentages)
        ax_arr[0].set_ylim(bottom=0)
        ax_arr[0].set_title('Win % (cumulative) = {:.2f}%'.format(resultpercentages[-1] * 100.))
        ax_arr[1].plot(gamenumbers, scores)
        ax_arr[1].set_title('Score')
        ax_arr[2].plot(gamenumbers, stepcounts)
        ax_arr[2].set_title('Actions per Game')
        ax_arr[3].plot(gamenumbers, max_tiles)
        ax_arr[3].set_title('Max Tile Placed')
        ax_arr[3].set_xlabel("Game #")
        ax_arr[3].xaxis.set_major_locator(MaxNLocator(integer=True))
        fig.canvas.manager.set_window_title("2048 Game Training Results")
        try:
            # Maximizing the window this way only works with the TkAgg backend
            plt.get_current_fig_manager().window.state('zoomed')
        except AttributeError:
            pass
        plt.show()
        return gamehistory
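

# A minimal usage sketch (not part of the original module): the constructor and
# train_model signatures are taken from the class above, but the board size,
# model directory and episode count here are illustrative assumptions.
if __name__ == "__main__":
    trainer = GameTrainer(board_size=4, save_model=True, model_dir="./model/", debug=True)
    history = trainer.train_model(episodes=10000, max_tile=2048)
    GameTrainer.display_training_history(history)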