def naive_game_step(board):
    # Try each of the four moves on a copy of the board and keep the ones that
    # actually change the grid.
    valid_next_board = []
    up_board = Board2048(board)
    res = up_board.up()
    if res:
        valid_next_board.append(up_board)
    down_board = Board2048(board)
    res = down_board.down()
    if res:
        valid_next_board.append(down_board)
    left_board = Board2048(board)
    res = left_board.left()
    if res:
        valid_next_board.append(left_board)
    right_board = Board2048(board)
    res = right_board.right()
    if res:
        valid_next_board.append(right_board)
    if not valid_next_board:
        return False, None
    # Greedily pick the successor with the highest heuristic score.
    next_board = max(valid_next_board, key=naive_score)
    return True, next_board

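# naive_score is referenced above but not defined in this section. Below is a
# minimal sketch of one plausible heuristic, assuming the board exposes its grid
# as a numpy array (as check_valid_board does further down via board.grid_); the
# project's actual scoring function may differ.
def naive_score_sketch(board):
    import numpy as np
    grid = np.asarray(board.grid_)
    empty_cells = int(np.count_nonzero(grid == 0))
    # Favour boards with more free cells and a larger maximum tile.
    return empty_cells + int(grid.max())
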
def generate_replay_buffer_using_A_star(batch_size, maxlen):
    from tqdm import tqdm

    #-- Set a max replay buffer size --#
    replay_buffer = deque(maxlen=maxlen)
    #-- Mapping of action to int --#
    move = {'up': 0, 'down': 1, 'left': 2, 'right': 3}
    #-- Run A* for each batch --#
    for _ in tqdm(range(batch_size)):
        b = Board2048()
        current_node = A_star(b)['current_node']
        #-- Trace back through parent nodes and register all moves --#
        while current_node.parent is not None:
            current = current_node
            parent = current_node.parent
            done = int(current.is_root())
            action = move[current.move]
            # The parent board is the state before the move; the current board is
            # the state after it.
            reward = reward_func_merge_score(parent.board, current.board, action, done)
            #-- Append (state, action, reward, next_state, done) transitions --#
            replay_buffer.append(
                (parent.board, action, reward, current.board, done))
            current_node = current_node.parent
    return replay_buffer

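# A minimal usage sketch, assuming A_star, Board2048 and reward_func_merge_score
# are available in this module; the batch size and buffer length below are
# illustrative only.
def _collect_a_star_buffer_example():
    return generate_replay_buffer_using_A_star(batch_size=10, maxlen=50_000)
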
def basic_upleft_algorithm(self, k=4):
    board = Board2048(k=k)
    simple_score = board.simple_score()
    single_game_history = []
    while True:
        # Alternate "up" and "left" moves; fall back to "down"/"right" when stuck.
        board = board.peek_action("up")
        single_game_history.append(
            (board.state, 'up', board.simple_score(), board.merge_score()))
        board = board.peek_action("left")
        single_game_history.append(
            (board.state, 'left', board.simple_score(), board.merge_score()))
        if simple_score == board.simple_score():
            board = board.peek_action('down')
            single_game_history.append(
                (board.state, 'down', board.simple_score(), board.merge_score()))
            board = board.peek_action('right')
            single_game_history.append(
                (board.state, 'right', board.simple_score(), board.merge_score()))
            if simple_score == board.simple_score():
                break
        simple_score = board.simple_score()
    self.games_history.append(single_game_history)
    return board

def play_game(self, random_policy=False):
    board = Board2048()
    done = False
    single_game_history = []
    while not done:
        available_moves = board.available_moves_as_torch_unit_vector(
            device=self.device)
        done = torch.max(available_moves) == 0
        state = board.normalized().state_as_4d_tensor().to(self.device)
        if not random_policy:
            Q_values = self.model(state)
        else:
            Q_values = torch.rand((4,), device=self.device)
        available_Q_values = available_moves * Q_values
        next_action = torch.argmax(available_Q_values)
        next_board = board.peek_action(next_action)
        reward = self.reward_func(board, next_board, next_action, done)
        merge_score = board.merge_score()
        single_game_history.append(
            (board.state, ['u', 'd', 'l', 'r'][int(next_action)], reward,
             merge_score))
        board = next_board
    self.games_history.append(single_game_history)
    return single_game_history

def search_best_step(root_node, step=3):
    # Breadth-first expansion of the game tree up to `step` levels deep.
    stack = [root_node]
    for _ in range(step):
        new_stack = []
        for game_node in stack:
            up_board = Board2048(game_node.board)
            res = up_board.up()
            if res:
                new_game_node = GameNode(up_board)
                game_node.children.append(new_game_node)
                new_stack.append(new_game_node)
            down_board = Board2048(game_node.board)
            res = down_board.down()
            if res:
                new_game_node = GameNode(down_board)
                game_node.children.append(new_game_node)
                new_stack.append(new_game_node)
            left_board = Board2048(game_node.board)
            res = left_board.left()
            if res:
                new_game_node = GameNode(left_board)
                game_node.children.append(new_game_node)
                new_stack.append(new_game_node)
            right_board = Board2048(game_node.board)
            res = right_board.right()
            if res:
                new_game_node = GameNode(right_board)
                game_node.children.append(new_game_node)
                new_stack.append(new_game_node)
        stack = new_stack
    # Pick the immediate child of the root with the best score
    # (the children are GameNode objects).
    best_child_board, best_score = None, -1
    for child_board in root_node.children:
        score = child_board.score()
        if score > best_score:
            best_child_board = child_board
            best_score = score
    return best_child_board

def __init__(self, initial_grid, player_mode, game_mode, method_idx, play_turn=True):
    self.board = Board2048(grid=initial_grid, player_turn=play_turn, score=0)
    self.player_mode = player_mode
    self.game_mode = game_mode
    self.method = METHODS[method_idx]
    print(self.board)

def naive_game_play():
    # Init game play
    board = Board2048()
    board.add_num()
    board.add_num()
    board.pprint()
    while True:
        alive, board = naive_game_step(board)
        if not alive:
            break
        board.pprint()
        board.add_num()
        board.pprint()
        input()

def search_game_play():
    # Init game play
    board = Board2048()
    board.add_num()
    board.add_num()
    board.pprint()
    root_node = GameNode(board)
    while True:
        root_node = search_best_step(root_node)
        if not root_node:
            break
        root_node.board.pprint()
        root_node.board.add_num()
        root_node.board.pprint()
        input()

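# A possible entry point tying the two interactive loops above together; the
# command-line switch is an assumption, the original scripts may select the
# algorithm differently.
if __name__ == "__main__":
    import sys
    if len(sys.argv) > 1 and sys.argv[1] == "search":
        search_game_play()
    else:
        naive_game_play()
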
def check_valid_board(board, move):
    # Rotate the grid so the requested move can always be applied as a left move,
    # then rotate the result back.
    if move == "left":
        oldgrid = board.grid_
        newgrid, score_added = move_grid_helper(oldgrid)
    elif move == "up":
        oldgrid = np.rot90(board.grid_, 1)
        newgrid, score_added = move_grid_helper(oldgrid)
        newgrid = np.rot90(newgrid, -1)
    elif move == "right":
        oldgrid = np.rot90(board.grid_, 2)
        newgrid, score_added = move_grid_helper(oldgrid)
        newgrid = np.rot90(newgrid, 2)
    else:
        oldgrid = np.rot90(board.grid_, -1)
        newgrid, score_added = move_grid_helper(oldgrid)
        newgrid = np.rot90(newgrid, 1)
    return Board2048(newgrid, True, board.score + score_added)

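# move_grid_helper is not defined in this section; below is a plausible sketch
# with the same (new_grid, score_added) contract, implementing a standard
# leftward slide-and-merge (numpy is assumed to be imported as np, as above).
# The project's real helper may differ in its details.
def move_grid_helper_sketch(grid):
    grid = np.asarray(grid)
    new_grid = np.zeros_like(grid)
    score_added = 0
    for r in range(grid.shape[0]):
        tiles = [v for v in grid[r] if v != 0]  # slide non-zero tiles to the left
        merged, i = [], 0
        while i < len(tiles):
            if i + 1 < len(tiles) and tiles[i] == tiles[i + 1]:
                merged.append(tiles[i] * 2)     # merge each equal pair once
                score_added += int(tiles[i] * 2)
                i += 2
            else:
                merged.append(tiles[i])
                i += 1
        new_grid[r, :len(merged)] = merged
    return new_grid, score_added
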
def play_one_step(board: Board2048,
                  epsilon: float,
                  model: torch.nn.Sequential,
                  replay_buffer: deque,
                  device: str,
                  reward_function: Callable = reward_func_merge_score,
                  board_to_tensor_function: Callable = board_as_4d_tensor):
    action, done, max_q_value = epsilon_greedy_policy(
        board,
        epsilon=epsilon,
        model=model,
        device=device,
        board_to_tensor_function=board_to_tensor_function)
    next_board = board.peek_action(action)
    reward = reward_function(board, next_board, action, done)
    # Store the (state, action, reward, next_state, done) transition for replay.
    replay_buffer.append((board, action, reward, next_board, done))
    return next_board, action, reward, done, max_q_value

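# Sketch of a single self-play episode driven by play_one_step; the fixed epsilon
# value and buffer size are illustrative, and epsilon_greedy_policy is assumed to
# be provided by the surrounding module as used above.
from collections import deque  # may already be imported at module level

def collect_one_episode_sketch(model, device, epsilon=0.1):
    replay_buffer = deque(maxlen=10_000)
    board = Board2048()
    done = False
    while not done:
        board, _action, _reward, done, _max_q = play_one_step(
            board, epsilon, model, replay_buffer, device)
    return replay_buffer
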
else:
    initial_grid_list = InitialStates(shape).generate(1)
    csvfile = open("log_size{0}*{1}_24_static.csv".format(shape[0], shape[1]), "w")
    writer = csv.writer(csvfile)
    writer.writerow([
        'initial state', 'end state', 'score', 'max value', 'time(s)',
        'number of nodes'
    ])
    i = 0
    for initial_grid in initial_grid_list:  # [58:59]: #58 True 10 false
        i += 1
        print(
            "############################################ New Game ##################################################",
            i)
        board = Board2048(grid=initial_grid, player_turn=True, score=0)
        print("initial board:", board)
        cnt_nodes = 0
        try:
            with open('{0}*{1}_data_.pkl'.format(shape[0], shape[1]), 'rb') as data_file:
                heuristic_table = pickle.load(data_file)
            print(len(heuristic_table))
        except Exception:
            heuristic_table = {}
        for depth_limit in range(15, 50, 5):
            cnt_nodes_it = 0
            is_complete = True
            trans_table = {}

def __init__(self, board):
    self.board = Board2048(board)
    self.children = []

def reward_func_merge_score(board: Board2048, next_board: Board2048,
                            action: int, done: int) -> int:
    return next_board.merge_score() - board.merge_score()

def board_as_4d_tensor(board: Board2048, device: str) -> torch.Tensor:
    return board.log_scale().state_as_4d_tensor().to(device)

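# Quick illustration of the two helpers above, assuming peek_action accepts a
# move name as in basic_upleft_algorithm; the device string and the chosen move
# are arbitrary for the sake of the example.
def _demo_reward_and_tensor(device="cpu"):
    board = Board2048()
    next_board = board.peek_action("up")
    reward = reward_func_merge_score(board, next_board, action=0, done=0)
    state = board_as_4d_tensor(board, device)
    return reward, state.shape
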
def training_loop(replay_buffer_length, no_episodes, no_episodes_to_reach_epsilon,
                  no_episodes_to_fill_up_existing_model_replay_buffer, min_epsilon,
                  model, reward_function, board_to_tensor_function, device,
                  experiment, snapshot_game_every_n_episodes,
                  no_episodes_before_training, batch_size, discount_factor,
                  target_model, loss_fn, optimizer, use_double_dqn,
                  no_episodes_before_updating_target, extract_samples_function,
                  replay_buffer_override=None):
    try:
        if replay_buffer_override:
            replay_buffer = replay_buffer_override
        else:
            replay_buffer = deque(maxlen=replay_buffer_length)
        for ep in range(no_episodes):
            print(ep)
            board = Board2048()
            done = False
            board_history = []
            rewards = []
            q_values = []
            epsilon = None
            while not done:
                # Value to determine how greedy the policy should be for that step.
                epsilon = max(
                    (no_episodes_to_reach_epsilon - ep) / no_episodes_to_reach_epsilon,
                    min_epsilon)
                if ep < no_episodes_to_fill_up_existing_model_replay_buffer:
                    epsilon = 0
                new_board, action, reward, done, max_q_value = play_one_step(
                    board,
                    epsilon,
                    model,
                    replay_buffer,
                    reward_function=reward_function,
                    board_to_tensor_function=board_to_tensor_function,
                    device=device)
                board_history.append(
                    (board.state, ['u', 'd', 'l', 'r'][int(action)], reward))
                rewards.append(reward)
                q_values.append(float(max_q_value))
                board = new_board
            mean_of_rewards = np.mean(np.array(rewards))
            mean_of_q_values = np.mean(np.array(q_values))
            experiment.add_episode(board, epsilon, ep, mean_of_rewards, mean_of_q_values)
            if ep % snapshot_game_every_n_episodes == 0:
                experiment.snapshot_game(board_history, ep)
            if ep % 10 == 0:
                print(
                    f"Episode: {ep}: {board.merge_score()}, {np.max(board.state.flatten())}, {len(board._action_history)}"
                )
            if ep > no_episodes_before_training:
                train_step(batch_size,
                           discount_factor,
                           model,
                           target_model,
                           replay_buffer,
                           loss_fn,
                           optimizer,
                           device=device,
                           use_double_dqn=use_double_dqn,
                           board_to_tensor_function=board_to_tensor_function,
                           extract_samples_function=extract_samples_function)
            if ep % no_episodes_before_updating_target == 0 and ep >= no_episodes_to_fill_up_existing_model_replay_buffer:
                target_model.load_state_dict(copy.deepcopy(model.state_dict()))
            if ep % 1000 == 0:
                experiment.save()
                print("Saved game")
        experiment.save()
    except KeyboardInterrupt as e:
        print(e)
        print(
            f'\nKeyboard interrupt caught. Saving current experiment in {experiment.folder}'
        )
        experiment.save()
    except Exception as e:
        experiment.save()
        print(f'\nSaving current experiment in {experiment.folder}\n')
        raise e

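# The epsilon schedule used inside the loop above decays linearly from 1.0 to
# min_epsilon over no_episodes_to_reach_epsilon episodes; the same rule as a
# standalone helper, for reference:
def epsilon_for_episode(ep, no_episodes_to_reach_epsilon, min_epsilon):
    return max((no_episodes_to_reach_epsilon - ep) / no_episodes_to_reach_epsilon,
               min_epsilon)
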