    def action(self, acting_model: keras.Model, models_memory: SimpleMemory,
               environment: cb.ChessBoard, epsilon):
        flip = not environment.current_turn()
        moves, states, fens = environment.get_moves(flip=flip)
        best_move = None
        best_state = None
        best_state_fen = None
        if np.random.uniform(0, 1) < epsilon:
            # explore: pick a random legal move
            moves_states_list = list(map(list, zip(moves, states, fens)))
            choices = len(moves_states_list)
            if choices > 1:
                random_element = moves_states_list[np.random.randint(0, choices)]
            else:
                random_element = moves_states_list[0]
            best_move, best_state, best_state_fen = random_element
        else:
            # exploit: let the acting model pick the highest-valued move
            best_move, best_state, best_state_fen = self.choose_action(
                acting_model, moves, states, fens)
        # make move
        environment.make_move(best_move, flip)
        real_prize = environment.get_results()
        best_state = np.array(best_state).reshape((384, ))
        real_prize = np.array([real_prize]).reshape((1, 1))
        if real_prize == cb.IGNORE_GO:
            # do not store transitions the environment marks as ignorable
            return
        record = DQNChessRecord()
        record.state = best_state
        record.fen = best_state_fen
        record.reward = real_prize
        models_memory.add(record)

    def training(self, acting_model: keras.Model, target_model: keras.Model,
                 models_memory: SimpleMemory, batch_size: int, gamma: float):
        training_batch = models_memory.get_batch(
            batch_size, min_rows=self.START_TRAINING_AT)
        if training_batch is not None:
            samples = [[record.state, record.reward, record.fen]
                       for record in training_batch]
            states, prizes, fens = list(map(list, zip(*samples)))
            reinforced_prizes = []
            for p, f in zip(prizes, fens):
                training_board = cb.ChessBoard(starting_fen=f)
                p = p[0]
                if not training_board.game_over():
                    # bootstrap the stored reward with the target network's
                    # value of the greedily chosen next state
                    next_moves, next_states, next_fens = training_board.get_moves()
                    _, chosen_state, _ = self.choose_action(
                        acting_model, next_moves, np.array(next_states),
                        next_fens)
                    estimated_next_prize = target_model.predict(
                        np.array(chosen_state.reshape((1, 384))))[0]
                    reinforced_p = p + gamma * estimated_next_prize
                else:
                    reinforced_p = p
                reinforced_prizes.append(reinforced_p)

            states = np.array(states)
            reinforced_prizes = np.array(reinforced_prizes)
            acting_model.train_on_batch(states, reinforced_prizes)
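The update above is the standard one-step DQN target: the stored reward is bootstrapped with the target network's value of the greedily chosen next state, discounted by gamma. A minimal numeric sketch of that formula (the values below are illustrative, not taken from a real run):

# reinforced_p = p + gamma * Q_target(best next state)
p = 0.1                      # reward stored in the replay record (illustrative value)
gamma = 0.99                 # discount factor passed to training()
estimated_next_prize = 0.5   # target network's value for the chosen next state (illustrative)
reinforced_p = p + gamma * estimated_next_prize
print(reinforced_p)          # 0.595 becomes the regression target for acting_model
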
    def training(self, acting_model: keras.Model, target_model: keras.Model,
                 models_memory: SimpleMemory, batch_size: int, gamma: float):
        training_batch = models_memory.get_batch(
            batch_size, min_rows=self.START_TRAINING_AT)
        if training_batch is not None:
            samples = [[record.state, record.reward, record.fen]
                       for record in training_batch]
            states, prizes, fens = list(map(list, zip(*samples)))
            reinforced_prizes = []
            for p, f in zip(prizes, fens):
                training_board = cb.ChessBoard(starting_fen=f)
                p = p[0]
                if not training_board.game_over():
                    # predict opponent's move
                    opponents_next_moves, opponents_next_states, opponents_next_fens = \
                        training_board.get_moves(flip=True)
                    opponents_move, _, _, _ = self.choose_action(
                        target_model, opponents_next_moves,
                        np.array(opponents_next_states), opponents_next_fens)
                    training_board.make_move(opponents_move, flipped=True)
                    opponents_prize = training_board.get_results()
                    if opponents_prize > cb.ATTACK:
                        reinforced_p = p - gamma * opponents_prize
                    else:
                        # get expected next move's reward
                        possible_moves, possible_states, possible_fens = \
                            training_board.get_moves()
                        _, _, _, estimated_next_prize = self.choose_action(
                            target_model, possible_moves,
                            np.array(possible_states), possible_fens)
                        # choose_action may return either a plain int or a
                        # one-element array here; unwrap it to a scalar
                        if not isinstance(estimated_next_prize, int):
                            estimated_next_prize = estimated_next_prize[0]

                        reinforced_p = p + gamma * (estimated_next_prize -
                                                    opponents_prize)
                else:
                    reinforced_p = p
                reinforced_prizes.append(reinforced_p)

            states = np.array(states)
            reinforced_prizes = np.array(reinforced_prizes)
            acting_model.train_on_batch(states, reinforced_prizes)
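Note that the theta argument passed to model_trainer.train() in the scripts below never reaches these training functions, so it is presumably consumed inside DQNTrainer itself. In a two-network DQN it is commonly the soft-update (Polyak) coefficient that pulls the target network toward the acting network. The following is a minimal sketch of that usual technique, assuming plain Keras models; it is an assumption about what DQNTrainer does, not its actual code:

def soft_update(acting_model, target_model, theta):
    # Polyak averaging: target <- theta * acting + (1 - theta) * target
    mixed = [theta * aw + (1.0 - theta) * tw
             for aw, tw in zip(acting_model.get_weights(),
                               target_model.get_weights())]
    target_model.set_weights(mixed)
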
Example #4
        for p, f in zip(prizes, fens):
            training_board = cb.ChessBoard(starting_fen=f)
            if not training_board.game_over():
                next_moves, next_states, next_fens = training_board.get_moves()
                _, chosen_state, _ = choose_action(acting_model, next_moves,
                                                   np.array(next_states),
                                                   next_fens)
                estimated_next_prize = target_model.predict(
                    np.array(chosen_state.reshape((1, 384))))[0]
                reinforced_p = p + gamma * estimated_next_prize
            else:
                reinforced_p = p
            reinforced_prizes.append(reinforced_p)

        states = np.array(states)
        reinforced_prizes = np.array(reinforced_prizes)
        acting_model.train_on_batch(states, reinforced_prizes)


if LOAD:
    model_trainer = load_trainer(LOAD_FROM, NAME, action, training)
else:
    memory = SimpleMemory(MEMORY_SIZE)
    model_trainer = DQNTrainer(model, memory, action, training)

board = cb.ChessBoard()
for i in range(START_AT_STEP, TRAINING_STEPS):
    print("Step {} of {}".format(i + 1, TRAINING_STEPS))
    model_trainer.take_action(board, get_epsilon(i))
    model_trainer.train(batch_size=BATCH, gamma=GAMMA, theta=THETA)
    if i % 1000 == 0:
        model_trainer.save("tmp", "{}_{}".format(NAME, i))

model_trainer.save("final", "{}_{}k".format(NAME, int(TRAINING_STEPS / 1000)))
seed = 12345
np.random.seed(seed)
# temporary simple model for testing base concept
model_template = BuzdyganDQNv1Templte()
LOAD = True
LOAD_MEMORY = True
LOAD_FROM = "final/"

if LOAD:
    model_trainer = load_trainer(LOAD_FROM,
                                 "{}_60k".format(model_template.NAME),
                                 model_template.action,
                                 model_template.training,
                                 has_memory=LOAD_MEMORY)
    if not LOAD_MEMORY:
        memory = SimpleMemory(model_template.MEMORY_SIZE)
        model_trainer.add_memory(memory)
else:
    model = model_template.new_model(seed)
    memory = SimpleMemory(model_template.MEMORY_SIZE)
    model_trainer = DQNTrainer(model, memory, model_template.action,
                               model_template.training)

board = cb.ChessBoard()
for i in range(model_template.START_AT_STEP, model_template.TRAINING_STEPS):
    print("Step {} of {}".format(i + 1, model_template.TRAINING_STEPS))
    model_trainer.take_action(board, model_template.get_epsilon(i))
    model_trainer.train(batch_size=model_template.BATCH,
                        gamma=model_template.GAMMA,
                        theta=model_template.THETA)
    if i % model_template.SAVE_PER_STEPS == 0:
        model_trainer.save("tmp", "{}_{}".format(model_template.NAME, i))

        for p, f in zip(prizes, fens):
            training_board = cb.ChessBoard(starting_fen=f)
            if not training_board.game_over():
                next_moves, next_states, next_fens = training_board.get_moves()
                _, chosen_state, _ = choose_action(acting_model, next_moves, next_states, next_fens)
                estimated_next_prize = target_model.predict(np.array(chosen_state).reshape((1, 1, 384)))[0]
                reinforced_p = p + gamma * estimated_next_prize
            else:
                reinforced_p = p
            reinforced_prizes.append(reinforced_p)

        states = np.array(states)
        reinforced_prizes = np.array(reinforced_prizes)
        acting_model.train_on_batch(states, reinforced_prizes)


memory = SimpleMemory(int(1e+5))
model_trainer = DQNTrainer(model, memory, action, training)

board = cb.ChessBoard()
TRAINING_STEPS = int(2e+5)
for i in range(TRAINING_STEPS):
    print("Step {} of {}".format(i+1, TRAINING_STEPS))
    model_trainer.take_action(board, 0.3)
    model_trainer.train(batch_size=32, gamma=0.99, theta=0.005)
    if i % 1000 == 0:
        model_trainer.save("./tmp_model.h5")

model_trainer.save("./model.h5")