import random

import numpy as np
#NOTE: Keras import paths assumed (standalone Keras API); adjust to
#tensorflow.keras.* if the project uses TensorFlow's bundled Keras.
from keras.models import Sequential, clone_model
from keras.layers import Dense
from keras.optimizers import RMSprop

import game


def trainModelWithDMC(p_boardSizeX,
                      p_boardSizeY,
                      p_episodes,
                      savePath,
                      p_temperature=20.0,
                      p_gamma=0.9,
                      howFar=2,
                      rewardFunc=game.rewardFunc2,
                      selfPlayFixOpp=False,
                      startingModel=None,
                      opponentConstr=None,
                      opponentStrength=None):
    """Train a Q-network with deep Monte Carlo (DMC) updates.

    One game is played per episode using Boltzmann exploration; afterwards the
    discounted accumulated return of each step is used as the regression
    target for the action taken in that step.
    """
    model = startingModel

    if startingModel is None:
        model = Sequential()
        model.add(
            Dense(70,
                  kernel_initializer='lecun_uniform',
                  activation='relu',
                  input_shape=(2 * (4 + (2 * howFar + 1) *
                                    (2 * howFar + 1)), )))

        model.add(
            Dense(35, kernel_initializer='lecun_uniform', activation='relu'))

        model.add(
            Dense(3, kernel_initializer='lecun_uniform', activation='linear'))

        model.compile(loss='mean_squared_error', optimizer=RMSprop())

    episodes = p_episodes
    gamma = p_gamma  #discount factor for the accumulated returns
    startingTemperature = p_temperature  #Boltzmann temperature, annealed to 1.0
    temperature = p_temperature
    batchSize = 1  #one training example per update
    buffer = 1  #replay buffer of size one, so there is effectively no replay
    replay = []
    h = 0  #write index into the replay buffer
    updateStep = 0
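    #main loop: play one game per episode with Boltzmann exploration, then
    #regress the chosen actions' Q-values onto their discounted returns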
    for i in range(episodes):

        rewardList = []
        stateList = []
        actionList = []

        #init board
        board = game.Board(p_boardSizeX, p_boardSizeY)

        #set opponent
        if opponentConstr is None:
            if selfPlayFixOpp:
                oppModel = clone_model(model)
                oppModel.set_weights(model.get_weights())
                board.setPlayers(
                    game.AITrainingReduFeatWOPlayer("1", board, model, howFar),
                    game.AITrainingReduFeatWOPlayer("2", board, oppModel,
                                                    howFar))
            else:
                board.setPlayers(
                    game.AITrainingReduFeatWOPlayer("1", board, model, howFar),
                    game.AITrainingReduFeatWOPlayer("2", board, model, howFar))
        else:
            try:
                opp = opponentConstr("2", board, opponentStrength)
            except TypeError:
                opp = opponentConstr("2")
            board.setPlayers(
                game.AITrainingReduFeatWOPlayer("1", board, model, howFar),
                opp)

        #set start strategy
        board.startGameWithPseudoRandomStartPositions()
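        #play the game to completion, recording state, action and reward per step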
        while (board.checkGameStatus() == 0):

            #Boltzmann action selection over the three relative moves (-1, 0, 1)
            board.player1.getDirection()
            Qprobs = game.softmax(board.player1.vals / temperature)
            action = np.random.choice(3, p=Qprobs[0]) - 1
            actionList.append(action)
            #record the pre-move state, then take the action and observe the reward

            state = board.to01ReducedFeaturesWithOpponent(
                board.player1, board.player2,
                howFar).reshape(1,
                                2 * (4 + (2 * howFar + 1) * (2 * howFar + 1)))
            stateList.append(state)

            board.movePlayers(action, board.player2.getDirection())

            gameStatus = board.checkGameStatus()

            #Observe reward
            reward = rewardFunc(gameStatus)

            rewardList.append(reward)

        #compute the discounted accumulated return for every step of the game
        dAC = discountedAccRewards(rewardList, gamma)
        for stepNum in range(len(rewardList)):

            if (len(replay) < buffer):
                replay.append(
                    (stateList[stepNum], actionList[stepNum], dAC[stepNum]))
            else:
                #advance the circular write index and overwrite that slot
                h = (h + 1) % buffer
                replay[h] = (stateList[stepNum], actionList[stepNum],
                             dAC[stepNum])
                #randomly sample our experience replay memory
                minibatch = random.sample(replay, batchSize)
                X_train = []
                y_train = []
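                #X_train/y_train hold one (state, target Q-vector) pair per
                #sampled memory; only the taken action's entry is replaced with
                #its discounted accumulated return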

                for memory in minibatch:
                    state, action, accReward = memory
                    Qvals = model.predict(state, batch_size=1)
                    y = np.zeros((1, 3))
                    y[:] = Qvals[:]
                    y[0][action +
                         1] = accReward  #action + 1 because actions are -1,0,1
                    X_train.append(
                        state.reshape(
                            2 * (4 + (2 * howFar + 1) * (2 * howFar + 1)), ))
                    y_train.append(y.reshape(3, ))

                X_train = np.array(X_train)
                y_train = np.array(y_train)
                print("Game #: %s" % (i, ))
                model.fit(X_train,
                          y_train,
                          batch_size=batchSize,
                          epochs=1,
                          verbose=1)
                updateStep += 1
        #periodic checkpoint
        if i % 10000 == 0:
            model.save(savePath)
        #anneal the Boltzmann temperature linearly down to a floor of 1.0
        temperature = max(1.0, temperature - startingTemperature / episodes)
    model.save(savePath)
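

#The helper discountedAccRewards used in trainModelWithDMC is not shown in
#this snippet; the sketch below is an assumed implementation, provided only
#so the code above runs. It computes the discounted accumulated return
#G_t = r_t + gamma*r_{t+1} + gamma^2*r_{t+2} + ... for every step t.
def discountedAccRewards(rewardList, gamma):
    returns = np.zeros(len(rewardList))
    acc = 0.0
    #walk backwards through the episode, accumulating the discounted reward
    for t in reversed(range(len(rewardList))):
        acc = rewardList[t] + gamma * acc
        returns[t] = acc
    return returns
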
def trainModelWithDQL(p_boardSizeX,
                      p_boardSizeY,
                      p_episodes,
                      savePath,
                      p_temperature=20.0,
                      p_gamma=0.9,
                      fixedTargetSteps=500,
                      howFar=2,
                      rewardFunc=game.rewardFunc2,
                      selfPlayFixOpp=False,
                      startingModel=None,
                      opponentConstr=None,
                      opponentStrength=None):
    """Train a Q-network with deep Q-learning (DQL).

    Uses Boltzmann exploration, an experience replay buffer and a fixed
    target network that is refreshed every fixedTargetSteps updates.
    """
    model = startingModel

    if startingModel is None:
        model = Sequential()
        model.add(
            Dense(70,
                  kernel_initializer='lecun_uniform',
                  activation='relu',
                  input_shape=(2 * (4 + (2 * howFar + 1) *
                                    (2 * howFar + 1)), )))

        model.add(
            Dense(35, kernel_initializer='lecun_uniform', activation='relu'))

        model.add(
            Dense(3, kernel_initializer='lecun_uniform', activation='linear'))

        model.compile(loss='mean_squared_error', optimizer=RMSprop())

    episodes = p_episodes
    gamma = p_gamma  #discount factor for the Q-learning target
    startingTemperature = p_temperature  #Boltzmann temperature, annealed to 1.0
    temperature = p_temperature
    batchSize = 50  #minibatch size sampled from the replay buffer
    buffer = 500  #capacity of the experience replay buffer
    replay = []
    h = 0  #write index into the replay buffer
    targetModel = None  #fixed target network, refreshed every fixedTargetSteps
    updateStep = 0
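    #main loop: play one game per episode; once the replay buffer is full,
    #fit the network on a random minibatch after every move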
    for i in range(episodes):

        #init board
        board = game.Board(p_boardSizeX, p_boardSizeY)

        #set opponent
        if opponentConstr is None:
            if selfPlayFixOpp:
                oppModel = clone_model(model)
                oppModel.set_weights(model.get_weights())
                board.setPlayers(
                    game.AITrainingReduFeatWOPlayer("1", board, model, howFar),
                    game.AITrainingReduFeatWOPlayer("2", board, oppModel,
                                                    howFar))
            else:
                board.setPlayers(
                    game.AITrainingReduFeatWOPlayer("1", board, model, howFar),
                    game.AITrainingReduFeatWOPlayer("2", board, model, howFar))
        else:
            try:
                opp = opponentConstr("2", board, opponentStrength)
            except TypeError:
                opp = opponentConstr("2")
            board.setPlayers(
                game.AITrainingReduFeatWOPlayer("1", board, model, howFar),
                opp)

        #set start strategy
        board.startGameWithPseudoRandomStartPositions()
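        #play the game to completion, storing one transition per move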
        while (board.checkGameStatus() == 0):

            #Boltzmann action selection over the three relative moves (-1, 0, 1)
            board.player1.getDirection()
            Qprobs = game.softmax(board.player1.vals / temperature)
            action = np.random.choice(3, p=Qprobs[0]) - 1

            #Take action, observe new state S'

            oldState = board.to01ReducedFeaturesWithOpponent(
                board.player1, board.player2,
                howFar).reshape(1,
                                2 * (4 + (2 * howFar + 1) * (2 * howFar + 1)))
            board.movePlayers(action, board.player2.getDirection())
            newState = board.to01ReducedFeaturesWithOpponent(
                board.player1, board.player2,
                howFar).reshape(1,
                                2 * (4 + (2 * howFar + 1) * (2 * howFar + 1)))

            gameStatus = board.checkGameStatus()

            #Observe reward
            reward = rewardFunc(gameStatus)

            if (len(replay) < buffer):  #if buffer not filled, add to it
                replay.append((oldState, action, reward, gameStatus, newState))
            else:
                #advance the circular write index and overwrite that slot
                h = (h + 1) % buffer
                replay[h] = (oldState, action, reward, gameStatus, newState)
                #randomly sample our experience replay memory
                minibatch = random.sample(replay, batchSize)
                X_train = []
                y_train = []
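                #X_train/y_train hold one (state, target Q-vector) pair per
                #sampled transition; only the taken action's entry is replaced
                #with the Q-learning target r + gamma * max_a' Q_target(s', a')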

                #refresh the fixed target network every fixedTargetSteps updates
                if targetModel is None or updateStep % fixedTargetSteps == 0:
                    targetModel = clone_model(model)
                    targetModel.set_weights(model.get_weights())

                for memory in minibatch:
                    oldState, action, reward, gameStatus, newState = memory
                    oldQvals = model.predict(oldState, batch_size=1)
                    newQvals = targetModel.predict(newState, batch_size=1)
                    maxQval = np.max(newQvals)
                    y = np.zeros((1, 3))
                    y[:] = oldQvals[:]
                    if gameStatus == 0:  #non-terminal state
                        update = (reward + (gamma * maxQval))
                    else:  #terminal state
                        update = reward
                    y[0][action +
                         1] = update  #action + 1 because actions are -1,0,1
                    X_train.append(
                        oldState.reshape(
                            2 * (4 + (2 * howFar + 1) * (2 * howFar + 1)), ))
                    y_train.append(y.reshape(3, ))

                X_train = np.array(X_train)
                y_train = np.array(y_train)
                print("Game #: %s" % (i, ))
                model.fit(X_train,
                          y_train,
                          batch_size=batchSize,
                          epochs=1,
                          verbose=1)
                updateStep += 1
        #periodic checkpoint
        if i % 10000 == 0:
            model.save(savePath)
        #anneal the Boltzmann temperature linearly down to a floor of 1.0
        temperature = max(1.0, temperature - startingTemperature / episodes)
    model.save(savePath)
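

#Minimal usage sketch: the board size, episode count and save path below are
#illustrative assumptions, not values taken from the original project.
if __name__ == "__main__":
    trainModelWithDQL(20,               #p_boardSizeX (assumed)
                      20,               #p_boardSizeY (assumed)
                      100000,           #p_episodes (assumed)
                      "dql_model.h5",   #savePath (assumed)
                      selfPlayFixOpp=True)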