def action(self, acting_model: keras.Model, models_memory: SimpleMemory, environment: cb.ChessBoard, epsilon):
    """Play one epsilon-greedy move on *environment* and record the outcome.

    With probability *epsilon* a legal move is drawn uniformly at random;
    otherwise ``self.choose_action`` picks the move greedily with
    *acting_model*. The move is applied to the board, and the resulting
    (state, fen, reward) triple is stored in *models_memory* — unless the
    reward equals ``cb.IGNORE_GO``, in which case nothing is recorded.
    """
    # Whose turn it is decides whether the board representation is flipped.
    flip = not environment.current_turn()
    moves, states, fens = environment.get_moves(flip=flip)

    explore = np.random.uniform(0, 1) < epsilon
    if explore:
        candidates = list(map(list, zip(moves, states, fens)))
        n_candidates = len(candidates)
        # Only consume a random draw when there is an actual choice to make.
        idx = np.random.randint(0, n_candidates) if n_candidates > 1 else 0
        chosen_move, chosen_state, chosen_fen = candidates[idx]
    else:
        chosen_move, chosen_state, chosen_fen = self.choose_action(
            acting_model, moves, states, fens)

    # Apply the move and collect the environment's reward signal.
    environment.make_move(chosen_move, flip)
    reward = environment.get_results()

    # Shapes expected downstream: flat 384-vector state, (1, 1) reward.
    chosen_state = np.array(chosen_state).reshape((384, ))
    reward = np.array([reward]).reshape((1, 1))
    if reward == cb.IGNORE_GO:
        # Flagged results are deliberately kept out of the replay memory.
        return

    record = DQNChessRecord()
    record.state = chosen_state
    record.fen = chosen_fen
    record.reward = reward
    models_memory.add(record)
def training(self, acting_model: keras.Model, target_model: keras.Model, models_memory: SimpleMemory, batch_size: int, gamma: float):
    """Run one DQN update on a sampled replay batch.

    Skips silently until the memory holds at least
    ``self.START_TRAINING_AT`` rows. For every sampled position the target
    is the stored reward, plus — when the game is not over — *gamma* times
    the target network's value of the state the acting model would choose
    next (one-step TD bootstrap).
    """
    batch = models_memory.get_batch(
        batch_size, min_rows=self.START_TRAINING_AT)
    if batch is None:
        # Not enough experience collected yet.
        return

    states, rewards, fens = zip(*[(r.state, r.reward, r.fen) for r in batch])

    targets = []
    for reward, fen in zip(rewards, fens):
        board = cb.ChessBoard(starting_fen=fen)
        reward = reward[0]
        if board.game_over():
            # Terminal position: no future value to bootstrap from.
            targets.append(reward)
            continue
        next_moves, next_states, next_fens = board.get_moves()
        _, picked_state, _ = self.choose_action(
            acting_model, next_moves, np.array(next_states), next_fens)
        # Value of the greedily-picked successor, judged by the target net.
        bootstrap = target_model.predict(
            np.array(picked_state.reshape((1, 384))))[0]
        targets.append(reward + gamma * bootstrap)

    acting_model.train_on_batch(np.array(list(states)), np.array(targets))
def training(self, acting_model: keras.Model, target_model: keras.Model, models_memory: SimpleMemory, batch_size: int, gamma: float):
    """Run one DQN update using an opponent-lookahead target.

    Skips silently until the memory holds at least
    ``self.START_TRAINING_AT`` rows. For each non-terminal sample the
    target network first plays the opponent's reply (flipped board). If
    the opponent's resulting score exceeds ``cb.ATTACK`` the stored reward
    is penalized by ``gamma * opponent_score``; otherwise the target is
    the reward plus ``gamma`` times the difference between our estimated
    best follow-up value and the opponent's score.
    """
    batch = models_memory.get_batch(
        batch_size, min_rows=self.START_TRAINING_AT)
    if batch is None:
        # Not enough experience collected yet.
        return

    states, rewards, fens = zip(*[(r.state, r.reward, r.fen) for r in batch])

    targets = []
    for reward, fen in zip(rewards, fens):
        board = cb.ChessBoard(starting_fen=fen)
        reward = reward[0]
        if board.game_over():
            # Terminal position: no future value to bootstrap from.
            targets.append(reward)
            continue

        # Simulate the opponent's reply with the target network.
        opp_moves, opp_states, opp_fens = board.get_moves(flip=True)
        opp_move, _, _, _ = self.choose_action(
            target_model, opp_moves, np.array(opp_states), opp_fens)
        board.make_move(opp_move, flipped=True)
        opp_score = board.get_results()

        if opp_score > cb.ATTACK:
            # Opponent's reply exceeds the ATTACK threshold: apply a
            # discounted penalty instead of bootstrapping further.
            targets.append(reward - gamma * opp_score)
            continue

        # Otherwise bootstrap from our best estimated follow-up move.
        my_moves, my_states, my_fens = board.get_moves()
        _, _, _, follow_up = self.choose_action(
            target_model, my_moves, np.array(my_states), my_fens)
        # choose_action may yield a plain int or an array-like; take the
        # scalar either way.
        follow_up = follow_up if isinstance(follow_up, int) else follow_up[0]
        targets.append(reward + gamma * (follow_up - opp_score))

    acting_model.train_on_batch(np.array(list(states)), np.array(targets))
# NOTE(review): this chunk has lost its newlines and begins mid-way through a
# `training` function body (the enclosing `def`/`if` headers are not visible
# here), then continues with top-level driver code: it builds or loads a
# DQNTrainer, runs TRAINING_STEPS take_action/train iterations, checkpoints to
# "tmp" every 1000 steps and saves a "final" model at the end. Left
# byte-identical — reformat once the missing function header is in view.
# Presumably the leading fragment matches the simpler `training` variant
# defined earlier in this file — TODO confirm against the full source.
_, chosen_state, _ = choose_action(acting_model, next_moves, np.array(next_states), next_fens) estimated_next_prize = target_model.predict( np.array(chosen_state.reshape((1, 384))))[0] reinforced_p = p + gamma * estimated_next_prize else: reinforced_p = p reinforced_prizes.append(reinforced_p) states = np.array(states) reinforced_prizes = np.array(reinforced_prizes) acting_model.train_on_batch(states, reinforced_prizes) if LOAD: model_trainer = load_trainer(LOAD_FROM, NAME, action, training) else: memory = SimpleMemory(MEMORY_SIZE) model_trainer = DQNTrainer(model, memory, action, training) board = cb.ChessBoard() for i in range(START_AT_STEP, TRAINING_STEPS): print("Step {} of {}".format(i + 1, TRAINING_STEPS)) model_trainer.take_action(board, get_epsilon(i)) model_trainer.train(batch_size=BATCH, gamma=GAMMA, theta=THETA) if i % 1000 == 0: model_trainer.save("tmp", "{}_{}".format(NAME, i)) model_trainer.save("final", "{}_{}k".format(NAME, int(TRAINING_STEPS / 1000)))
# NOTE(review): collapsed driver script — seeds numpy, instantiates a model
# template, then either resumes a trainer from "final/" (optionally attaching
# a fresh SimpleMemory) or builds a new model + trainer, and runs the
# take_action/train loop using the template's hyperparameters. The chunk is
# cut off mid-statement: the trailing `if i % model_template.SAVE_PER_STEPS
# == 0:` has no body here, so it is left byte-identical.
# NOTE(review): `BuzdyganDQNv1Templte` looks like a misspelling of
# "Template" — verify against the class definition before renaming, since the
# class itself is not visible in this chunk.
seed = 12345 np.random.seed(seed) # temporary simple model for testing base concept model_template = BuzdyganDQNv1Templte() LOAD = True LOAD_MEMORY = True LOAD_FROM = "final/" if LOAD: model_trainer = load_trainer(LOAD_FROM, "{}_60k".format(model_template.NAME), model_template.action, model_template.training, has_memory=LOAD_MEMORY) if not LOAD_MEMORY: memory = SimpleMemory(model_template.MEMORY_SIZE) model_trainer.add_memory(memory) else: model = model_template.new_model(seed) memory = SimpleMemory(model_template.MEMORY_SIZE) model_trainer = DQNTrainer(model, memory, model_template.action, model_template.training) board = cb.ChessBoard() for i in range(model_template.START_AT_STEP, model_template.TRAINING_STEPS): print("Step {} of {}".format(i + 1, model_template.TRAINING_STEPS)) model_trainer.take_action(board, model_template.get_epsilon(i)) model_trainer.train(batch_size=model_template.BATCH, gamma=model_template.GAMMA, theta=model_template.THETA) if i % model_template.SAVE_PER_STEPS == 0:
# NOTE(review): collapsed chunk that begins inside a `training` function body
# (the enclosing `def` is not visible: `prizes`, `fens`, `choose_action`,
# `gamma` come from the missing header/scope), then continues with a
# complete hard-coded driver: 1e5-slot SimpleMemory, 2e5 steps, epsilon 0.3,
# batch 32, gamma 0.99, theta 0.005, checkpointing to ./tmp_model.h5 every
# 1000 steps and ./model.h5 at the end. Left byte-identical — reformat once
# the missing function header is in view. Note the reshape here is
# (1, 1, 384), unlike the (1, 384) used by the other `training` variants —
# presumably a model-input difference; confirm before unifying.
for p, f in zip(prizes, fens): training_board = cb.ChessBoard(starting_fen=f) if not training_board.game_over(): next_moves, next_states, next_fens = training_board.get_moves() _, chosen_state, _ = choose_action(acting_model, next_moves, next_states, next_fens) estimated_next_prize = target_model.predict(np.array(chosen_state).reshape((1, 1, 384)))[0] reinforced_p = p + gamma * estimated_next_prize else: reinforced_p = p reinforced_prizes.append(reinforced_p) states = np.array(states) reinforced_prizes = np.array(reinforced_prizes) acting_model.train_on_batch(states, reinforced_prizes) memory = SimpleMemory(int(1e+5)) model_trainer = DQNTrainer(model, memory, action, training) board = cb.ChessBoard() TRAINING_STEPS = int(2e+5) for i in range(TRAINING_STEPS): print("Step {} of {}".format(i+1, TRAINING_STEPS)) model_trainer.take_action(board, 0.3) model_trainer.train(batch_size=32, gamma=0.99, theta=0.005) if i % 1000 == 0: model_trainer.save("./tmp_model.h5") model_trainer.save("./model.h5")