# Imports assumed for this section (the original omits its import block):
# `game` is the repo's local game module; `discountedAccRewards` is a helper
# defined elsewhere in the repo; the rest come from neat-python, Keras and numpy.
import random

import numpy as np
import neat
from keras.models import Sequential, clone_model
from keras.layers import Dense
from keras.optimizers import RMSprop

import game


def eval_against_all(genomes, config):
    """NEAT fitness function: each genome plays 50 games on a 5x5 board
    against each of three opponent configurations (a random player and the
    one-step look-ahead AI at strengths 1 and 5). A win adds 1.0 to the
    genome's fitness, a loss subtracts 1.0."""
    for genome_id, genome in genomes:
        model = neat.nn.FeedForwardNetwork.create(genome, config)
        genome.fitness = 0.0
        sizeX = 5
        sizeY = 5
        for opp in [(game.RandomPlayer, 0),
                    (game.AILook1Player, 1),
                    (game.AILook1Player, 5)]:
            for episode in range(0, 50):
                board = game.Board(sizeX, sizeY)
                # RandomPlayer only takes a name; the look-ahead player also
                # needs the board and a search strength.
                if opp[0].__name__ == "RandomPlayer":
                    board.setPlayers(
                        game.AITrainingReduFeatPlayer("1", board, model, 2),
                        opp[0]("2"))
                else:
                    board.setPlayers(
                        game.AITrainingReduFeatPlayer("1", board, model, 2),
                        opp[0]("2", board, opp[1]))
                board.startGameOnDifferentEnds()
                result = board.play(False)
                if result == 1:
                    genome.fitness += 1.0
                elif result == 2:
                    genome.fitness -= 1.0
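# A minimal sketch of how eval_against_all could be plugged into a
# neat-python evolution run. The config file name "neat_config" and the
# generation count are assumptions for illustration, not part of the
# original code; the evaluation itself is exactly eval_against_all above.
def run_neat_example(generations=50):
    config = neat.Config(neat.DefaultGenome, neat.DefaultReproduction,
                         neat.DefaultSpeciesSet, neat.DefaultStagnation,
                         "neat_config")
    population = neat.Population(config)
    population.add_reporter(neat.StdOutReporter(True))
    # neat-python calls eval_against_all with (genomes, config) each generation.
    winner = population.run(eval_against_all, generations)
    return winner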
def trainModelWithDMC(p_boardSizeX,
                      p_boardSizeY,
                      p_episodes,
                      savePath,
                      p_temperature=20.0,
                      p_gamma=0.9,
                      howFar=3,
                      rewardFunc=game.rewardFunc2,
                      selfPlayFixOpp=False,
                      startingModel=None,
                      opponentConstr=None,
                      opponentStrength=None):
    """Train a Keras model with deep Monte Carlo: play full episodes,
    compute discounted accumulated rewards, and regress the Q-value of
    each chosen action towards its return."""
    model = startingModel
    if startingModel is None:
        # Input: 4 global features plus a (2*howFar+1)^2 window around the player.
        model = Sequential()
        model.add(Dense(70,
                        kernel_initializer='lecun_uniform',
                        activation='relu',
                        input_shape=(4 + (2 * howFar + 1) * (2 * howFar + 1), )))
        model.add(Dense(35, kernel_initializer='lecun_uniform', activation='relu'))
        # Three outputs: one Q-value per action (-1 = left, 0 = straight, 1 = right).
        model.add(Dense(3, kernel_initializer='lecun_uniform', activation='linear'))
        model.compile(loss='mean_squared_error', optimizer=RMSprop())

    episodes = p_episodes
    gamma = p_gamma
    startingTemperature = p_temperature
    temperature = p_temperature
    batchSize = 1
    buffer = 1
    replay = []
    h = 0
    updateStep = 0

    for i in range(episodes):
        rewardList = []
        stateList = []
        actionList = []

        # Init board.
        board = game.Board(p_boardSizeX, p_boardSizeY)

        # Set opponent: a frozen snapshot of the model, the live model itself,
        # or an externally supplied player constructor.
        if opponentConstr is None:
            if selfPlayFixOpp:
                oppModel = clone_model(model)
                oppModel.set_weights(model.get_weights())
                board.setPlayers(
                    game.AITrainingReduFeatPlayer("1", board, model, howFar),
                    game.AITrainingReduFeatPlayer("2", board, oppModel, howFar))
            else:
                board.setPlayers(
                    game.AITrainingReduFeatPlayer("1", board, model, howFar),
                    game.AITrainingReduFeatPlayer("2", board, model, howFar))
        else:
            try:
                opp = opponentConstr("2", board, opponentStrength)
            except TypeError:
                opp = opponentConstr("2")
            board.setPlayers(
                game.AITrainingReduFeatPlayer("1", board, model, howFar), opp)

        # Set start strategy.
        board.startGameWithPseudoRandomStartPositions()

        while board.checkGameStatus() == 0:
            # Boltzmann action selection over the player's Q-values.
            board.player1.getDirection()
            Qprobs = game.softmax(board.player1.vals / temperature)
            action_value = np.random.choice(Qprobs[0], p=Qprobs[0])
            action = np.argmax(Qprobs[0] == action_value) - 1
            actionList.append(action)

            # Take action, observe new state S'.
            state = board.to01ReducedFeatures(board.player1, howFar).reshape(
                1, 4 + (2 * howFar + 1) * (2 * howFar + 1))
            stateList.append(state)
            board.movePlayers(action, board.player2.getDirection())
            gameStatus = board.checkGameStatus()

            # Observe reward.
            reward = rewardFunc(gameStatus)
            rewardList.append(reward)

        # Episode finished: compute the discounted accumulated rewards and
        # run one training update per step of the episode.
        dAC = discountedAccRewards(rewardList, gamma)
        for stepNum in range(len(rewardList)):
            if len(replay) < buffer:
                replay.append(
                    (stateList[stepNum], actionList[stepNum], dAC[stepNum]))
            else:
                if h < (buffer - 1):
                    h += 1
                else:
                    h = 0
                replay[h] = (stateList[stepNum], actionList[stepNum], dAC[stepNum])

            # Randomly sample our experience replay memory.
            minibatch = random.sample(replay, batchSize)
            X_train = []
            y_train = []
            for memory in minibatch:
                state, action, accReward = memory
                Qvals = model.predict(state, batch_size=1)
                y = np.zeros((1, 3))
                y[:] = Qvals[:]
                y[0][action + 1] = accReward  # action + 1 because actions are -1, 0, 1
                X_train.append(
                    state.reshape(4 + (2 * howFar + 1) * (2 * howFar + 1), ))
                y_train.append(y.reshape(3, ))
            X_train = np.array(X_train)
            y_train = np.array(y_train)
            print("Game #: %s" % (i, ))
            model.fit(X_train, y_train, batch_size=batchSize, epochs=1, verbose=1)
            updateStep += 1

        # Periodic checkpoint and temperature annealing towards greedy play.
        if i % 10000 == 0:
            model.save(savePath)
        if temperature > 1.0:
            temperature -= (startingTemperature / episodes)
        else:
            temperature = 1.0

    model.save(savePath)
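# discountedAccRewards is defined elsewhere in the repo and is not part of
# this section. The sketch below only illustrates the quantity the training
# loop above expects, assuming it returns, for every step t, the discounted
# return sum_k gamma**k * r[t+k]; the real helper may differ.
def discountedAccRewards_sketch(rewards, gamma):
    returns = np.zeros(len(rewards))
    acc = 0.0
    for t in reversed(range(len(rewards))):
        acc = rewards[t] + gamma * acc
        returns[t] = acc
    return returns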
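# A minimal usage sketch, assuming the module is run as a script. The board
# size, episode count, temperature and save path below are illustrative
# values, not taken from the original code; game.AILook1Player is passed as
# a fixed opponent the same way eval_against_all constructs it.
if __name__ == "__main__":
    trainModelWithDMC(p_boardSizeX=5,
                      p_boardSizeY=5,
                      p_episodes=100000,
                      savePath="dmc_model.h5",
                      p_temperature=20.0,
                      p_gamma=0.9,
                      howFar=3,
                      opponentConstr=game.AILook1Player,
                      opponentStrength=1)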