def test_get_valid_actions():
    # UP, DOWN, LEFT, RIGHT are all valid
    all_board = tuple([2, 4, 8, 16, 2, 8, 16, 32, 32, 16, 8, 4, 32, 32, 4, 8])
    game = Nick2048()
    game.set_board(all_board)
    all_actions = [(a, r) for (a, r, b) in game.get_valid_actions()]
    assert game.board == all_board
    assert (game.UP, 68) in all_actions
    assert (game.DOWN, 68) in all_actions
    assert (game.RIGHT, 64) in all_actions
    assert (game.LEFT, 64) in all_actions
    for (a, r, b) in Nick2048.get_valid_actions_from_board(all_board):
        assert (a, r) in all_actions

    # No valid actions
    no_board = tuple([2, 4, 8, 16, 32, 64, 128, 256, 2, 4, 8, 16, 32, 64, 128, 256])
    game.set_board(no_board)
    no_actions = [(a, r) for (a, r, b) in game.get_valid_actions()]
    assert game.board == no_board
    assert len(no_actions) == 0
    for (a, r, b) in Nick2048.get_valid_actions_from_board(no_board):
        assert (a, r) in no_actions

    # Only DOWN or RIGHT is valid
    dr_board = tuple([2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])
    game.set_board(dr_board)
    some_actions = [(a, r) for (a, r, b) in game.get_valid_actions()]
    assert game.board == dr_board
    assert len(some_actions) == 2
    assert (game.DOWN, 0) in some_actions
    assert (game.RIGHT, 0) in some_actions
    for (a, r, b) in Nick2048.get_valid_actions_from_board(dr_board):
        assert (a, r) in some_actions
def _setup(self, config):
    self.params = config
    self.mlflow_client = mlflow.tracking.MlflowClient()
    self.mlflow_run = self.mlflow_client.create_run(experiment_id="0")
    self.mlflow_log_params(config)
    if "random_seed" in self.params:
        self.env = Nick2048(random_seed=self.params["random_seed"])
    else:
        self.env = Nick2048()
    # One small value network per action; all four share the same architecture.
    self.q_models = []
    q_model = keras.Sequential([
        keras.layers.Dense(20, activation="relu"),
        keras.layers.Dense(20, activation="relu"),
        keras.layers.Dense(20, activation="relu"),
        keras.layers.Dense(1),
    ])
    for _ in range(self.env.action_space.n):
        self.q_models.append(keras.models.clone_model(q_model))
    for m in self.q_models:
        m.build(input_shape=[1, self.env.observation_space.shape[0]])
    self.loss_fn = keras.losses.mean_squared_error
    self.optimizer = keras.optimizers.Adam(lr=self.params["learning_rate"])
    self.memory = Memory(self.params["buffer_size"])
def test_randomness():
    game1 = Nick2048()
    game2 = Nick2048()
    boards1 = _run_game(game1)
    boards2 = _run_game(game2)
    # If this fails, you either got REALLY unlucky or something is broken
    assert boards1 != boards2
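# _run_game is defined elsewhere in the test module and is not shown in this
# section. A minimal sketch of what it is assumed to do, namely play a fixed,
# deterministic sequence of moves and record the board after each step so that
# two runs differ only through the game's own tile-spawn randomness, could look
# like the following (the body and the move count here are assumptions):
def _run_game(game, num_moves=50):
    boards = []
    actions = [game.UP, game.RIGHT, game.DOWN, game.LEFT]
    for i in range(num_moves):
        # cycle through moves deterministically; only the tile spawns are random
        state, reward, done, _ = game.step(actions[i % len(actions)])
        boards.append(state)
        if done:
            break
    return tuple(boards)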
def test_rotate_board():
    # 2 0 4 8
    # 2 0 0 0
    # 4 4 0 0
    # 0 0 0 8
    board = (2, 0, 4, 8, 2, 0, 0, 0, 4, 4, 0, 0, 0, 0, 0, 8)
    result_90 = Nick2048.rotate_board_right(board)
    # 0 4 2 2
    # 0 4 0 0
    # 0 0 0 4
    # 8 0 0 8
    assert result_90 == (0, 4, 2, 2, 0, 4, 0, 0, 0, 0, 0, 4, 8, 0, 0, 8)
    result_180 = Nick2048.rotate_board_right(result_90)
    # 8 0 0 0
    # 0 0 4 4
    # 0 0 0 2
    # 8 4 0 2
    assert result_180 == (8, 0, 0, 0, 0, 0, 4, 4, 0, 0, 0, 2, 8, 4, 0, 2)
    result_270 = Nick2048.rotate_board_right(result_180)
    # 8 0 0 8
    # 4 0 0 0
    # 0 0 4 0
    # 2 2 4 0
    assert result_270 == (8, 0, 0, 8, 4, 0, 0, 0, 0, 0, 4, 0, 2, 2, 4, 0)
    result_360 = Nick2048.rotate_board_right(result_270)
    assert result_360 == board
def test_get_valid_actions_by_reward():
    # UP, DOWN, LEFT, RIGHT are all valid
    board = tuple([2, 4, 4, 2, 2, 8, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0])
    game = Nick2048()
    game.set_board(board)
    action_rewards = [(a, r) for (a, r, b) in game.get_valid_actions_by_reward()]
    assert game.board == board
    # Actions should come back ordered by reward: the two 24-point moves first.
    left_right = [(game.LEFT, 24), (game.RIGHT, 24)]
    up_down = [(game.UP, 4), (game.DOWN, 4)]
    assert action_rewards[0] in left_right
    assert action_rewards[1] in left_right
    assert action_rewards[2] in up_down
    assert action_rewards[3] in up_down
    for (a, r, b) in Nick2048.get_valid_actions_by_reward_from_board(board):
        assert (a, r) in action_rewards
def test_board_env_step_two():
    init_state = tuple([4, 2, 2, 4, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0])
    game = Nick2048()
    game.set_board(init_state)
    assert game.board == init_state
    # Top row [4, 2, 2, 4] squashes right to [_, 4, 4, 4]
    state, reward, done, _ = game.step(game.RIGHT)
    assert game.board[3] == 4
    assert game.board[2] == 4
    assert game.board[1] == 4
def test_get_afterstate():
    # 2 0 4 8
    # 2 0 0 0
    # 4 4 0 0
    # 0 0 0 8
    board = (2, 0, 4, 8, 2, 0, 0, 0, 4, 4, 0, 0, 0, 0, 0, 8)
    # UP merges the two 2s (4 points) and the two 8s (16 points): reward 20.
    after_up, up_reward = Nick2048.get_afterstate(board, Nick2048.UP)
    assert after_up == (4, 4, 4, 16, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0)
    assert up_reward == 20
    after_down, down_reward = Nick2048.get_afterstate(board, Nick2048.DOWN)
    assert after_down == (0, 0, 0, 0, 0, 0, 0, 0, 4, 0, 0, 0, 4, 4, 4, 16)
    assert down_reward == 20
    # LEFT and RIGHT only merge the two 4s in the third row: reward 8.
    after_left, left_reward = Nick2048.get_afterstate(board, Nick2048.LEFT)
    assert after_left == (2, 4, 8, 0, 2, 0, 0, 0, 8, 0, 0, 0, 8, 0, 0, 0)
    assert left_reward == 8
    after_right, right_reward = Nick2048.get_afterstate(board, Nick2048.RIGHT)
    assert after_right == (0, 2, 4, 8, 0, 0, 0, 2, 0, 0, 0, 8, 0, 0, 0, 8)
    assert right_reward == 8
def test_action_history(actions, expected_score=None, expected_tile=None):
    assert expected_score or expected_tile
    test_game = Nick2048(random_seed=SEED)
    for action in actions:
        test_game.step(action)
    if expected_score:
        assert test_game.score == expected_score
    if expected_tile:
        assert max(test_game.board) == expected_tile
def bfs_search():
    search_queue = []
    game = Nick2048(random_seed=SEED)
    update_search_queue(search_queue, game, ())
    state_action_pairs = set()
    max_tile = 0
    max_tile_history = ()
    max_score = 0
    max_score_history = ()
    depth_start_time = time.time()
    curr_depth = 1
    while len(search_queue) > 0:
        board, score, action, action_history = search_queue.pop(0)
        if len(action_history) >= curr_depth:
            depth_time = round(time.time() - depth_start_time, 1)
            print(
                f"Depth: {curr_depth}:"
                f"\n\tMax Tile: {max_tile} "
                f"({get_move_string(max_tile_history)})"
                f"\n\tMax Score: {max_score} "
                f"({get_move_string(max_score_history)})"
                f"\n\tTotal State Action Pairs: {len(state_action_pairs)}"
                f"\n\tDepth Time: {depth_time} sec"
            )
            mlflow.log_metric("Max Tile", max_tile, step=curr_depth)
            mlflow.log_metric("Max Score", max_score, step=curr_depth)
            mlflow.log_metric(
                "Total State Action Pairs", len(state_action_pairs), step=curr_depth
            )
            test_action_history(max_tile_history, expected_tile=max_tile)
            test_action_history(max_score_history, expected_score=max_score)
            max_tile = 0
            max_score = 0
            depth_start_time = time.time()
            curr_depth += 1
        game.set_board(board)
        game.score = score
        game.step(action)
        state_action_pairs.add((board, action))
        action_history = (*action_history, action)
        if max(game.board) > max_tile:
            max_tile = max(game.board)
            max_tile_history = action_history
        if game.score > max_score:
            max_score = game.score
            max_score_history = action_history
        update_search_queue(search_queue, game, action_history)
        if len(action_history) > DEPTH_LIMIT:
            break
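# bfs_search relies on several names defined elsewhere in the script (SEED,
# DEPTH_LIMIT, get_move_string, update_search_queue). Based on how the loop
# above restores the popped board and score and then replays the popped action,
# a plausible sketch of update_search_queue, offered as an assumption about the
# unshown helper rather than its actual definition, is:
def update_search_queue(search_queue, game, action_history):
    # Enqueue one entry per currently valid action so the BFS loop can later
    # restore this exact board/score and replay that action. dfs_search's
    # update_search_stack is presumably the same idea, with entries popped
    # from the end of the list instead of the front.
    if game.done:
        return
    for action, reward, afterstate in game.get_valid_actions():
        search_queue.append((game.board, game.score, action, action_history))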
def test_boardenv_fill_on_move_logic():
    # make sure a new piece is added that is either a 2 or a 4
    init_state = tuple([2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])
    game = Nick2048()
    game.set_board(init_state)
    assert game.board == init_state
    state, reward, done, _ = game.step(game.LEFT)
    assert state == game.board
    assert reward == 4
    assert len([v for v in game.board if v != 0]) == 2
def test_boardenv_move_logic_three_in_a_row():
    # make sure the behavior is correct when 3 consecutive cells in a column
    # hold the same value
    init_state = tuple([0, 2, 0, 0, 0, 2, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0])
    game = Nick2048()
    game.set_board(init_state)
    assert game.board == init_state
    state, reward, done, _ = game.step(game.DOWN)
    assert state == game.board
    assert reward == 4
    assert game.board[13] == 4
    assert game.board[9] == 2
def test_boardenv_move_logic_four_in_a_row():
    # make sure the behavior is correct when a row is full of the same value
    init_state = tuple([2, 2, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])
    game = Nick2048()
    game.set_board(init_state)
    assert game.board == init_state
    state, reward, done, _ = game.step(game.RIGHT)
    assert reward == 8
    assert game.board[2] == 4
    assert game.board[3] == 4
    state, reward, done, _ = game.step(game.RIGHT)
    assert reward == 8
    assert game.board[3] == 8
def test_board_env_step_one():
    init_state = tuple([2, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0])
    game = Nick2048()
    game.set_board(init_state)
    assert game.board == init_state
    # Note the move will add a random 2 or 4 into the board
    state, reward, done, _ = game.step(game.RIGHT)
    assert game.board[3] == 2
    assert game.board[11] == 2
    nonzeros = [v for v in game.board if v != 0]
    assert len(nonzeros) == 3
    for v in nonzeros:
        assert v in [2, 4]
def test_reflect_board():
    # 2 0 4 8
    # 2 0 0 0
    # 4 4 0 0
    # 0 0 0 8
    board = (2, 0, 4, 8, 2, 0, 0, 0, 4, 4, 0, 0, 0, 0, 0, 8)
    reflected_y_1 = Nick2048.reflect_board_across_y(board)
    # 8 4 0 2
    # 0 0 0 2
    # 0 0 4 4
    # 8 0 0 0
    assert reflected_y_1 == (8, 4, 0, 2, 0, 0, 0, 2, 0, 0, 4, 4, 8, 0, 0, 0)
    reflected_y_2 = Nick2048.reflect_board_across_y(reflected_y_1)
    assert reflected_y_2 == board
    reflected_x_1 = Nick2048.reflect_board_across_x(board)
    # 0 0 0 8
    # 4 4 0 0
    # 2 0 0 0
    # 2 0 4 8
    assert reflected_x_1 == (0, 0, 0, 8, 4, 4, 0, 0, 2, 0, 0, 0, 2, 0, 4, 8)
    reflected_x_2 = Nick2048.reflect_board_across_x(reflected_x_1)
    assert reflected_x_2 == board
def _setup(self, config):
    self.params = config
    self.mlflow_client = mlflow.tracking.MlflowClient()
    self.mlflow_run = self.mlflow_client.create_run(experiment_id="0")
    self.mlflow_log_params(config)
    self.env = Nick2048()
    self.v_model = keras.Sequential([
        keras.layers.Dense(20, activation="relu"),
        keras.layers.Dense(20, activation="relu"),
        keras.layers.Dense(20, activation="relu"),
        keras.layers.Dense(1),
    ])
    self.v_model.build(input_shape=[1, self.env.observation_space.shape[0]])
    self.loss_fn = keras.losses.mean_squared_error
    self.optimizer = keras.optimizers.Adam(lr=self.params["learning_rate"])
    self.memory = Memory(self.params["buffer_size"])
def test_boardenv_done_logic():
    init_state = tuple([16, 8, 16, 4, 4, 2, 4, 8, 32, 2, 32, 4, 4, 16, 4, 8])
    game = Nick2048()
    game.set_board(init_state)
    assert game.board == init_state
    state, reward, done, _ = game.step(game.RIGHT)
    assert state == game.board
    assert state == init_state
    assert not done
    assert reward == 0
    state, reward, done, _ = game.step(game.RIGHT)
    assert state == game.board
    assert state == init_state
    assert not done
    assert reward == 0
    state, reward, done, _ = game.step(game.LEFT)
    assert state == game.board
    assert state == init_state
    assert not done
    assert reward == 0
    state, reward, done, _ = game.step(game.DOWN)
    assert done
    assert reward == 4
def dfs_search():
    start = time.time()
    search_stack = []
    game = Nick2048(random_seed=SEED)
    update_search_stack(search_stack, game, [])
    max_score = 0
    max_action_history = []
    complete_games = 0
    while len(search_stack) > 0:
        board, score, action, action_history = search_stack.pop()
        action_history = action_history[:]
        game.set_board(board)
        game.score = score
        game.step(action)
        action_history.append(action)
        update_search_stack(search_stack, game, action_history)
        if game.done:
            complete_games += 1
            if game.score > max_score:
                max_score = game.score
                max_action_history = action_history[:]
                max_elapsed_time = time.time() - start
            if complete_games % 1001 == 1000:
                print(
                    f"Random seed: {SEED}\n"
                    f"Max action history: {max_action_history}\n"
                    f"Max Score: {max_score}\n"
                    f"Max moves: {len(max_action_history)}\n"
                    f"Max found after: {round(max_elapsed_time, 2)} sec\n"
                    f"Total elapsed time: {round(time.time() - start, 2)} sec\n"
                    f"Search stack size: {len(search_stack)}\n"
                    f"Complete Games: {complete_games}\n"
                    f"Current Score: {game.score}\n"
                )
                test_action_history(action_history, game.score)
    test_action_history(max_action_history, max_score)
def play_nick_version():
    game = Nick2048()
    run_manual_loop(game)
def test_boardenv_init():
    game = Nick2048()
    nonzero = [v for v in game.board if v != 0]
    assert len(nonzero) == 2
    for v in nonzero:
        assert v in [2, 4]
def test_set_board_makes_copy():
    init_state = tuple([2, 2, 0, 0, 2, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0])
    game = Nick2048()
    game.set_board(init_state)
    assert game.board == init_state
def play_with_seed(seed):
    game = Nick2048(random_seed=seed)
    run_manual_loop(game)
def _train(self):
    with mlflow.start_run():
        mlflow.log_params(self.params)
        optimizer = keras.optimizers.Adam(lr=self.params["learning_rate"])
        train_acc_metric = keras.metrics.SparseCategoricalAccuracy()
        game_scores = []
        game_num_steps = []
        b = Nick2048()
        for episode_num in range(self.params["num_episodes"]):
            state = b.reset()
            game_score = 0
            # Pseudo code for our Sarsa learning algo:
            # for each step in the rollout:
            #     action = fancy_argmax_a(q(get_afterstate(s), a))
            #     q_val = model(get_afterstate(s), action)
            #     next_s, r = b.step(a)
            #     next_action = fancy_argmax_a(q(get_afterstate(next_s), a))
            #     next_q_val = model(get_afterstate(next_s), next_action)
            #     update q_model using loss(q_val - (r + next_q_val))
            for step_num in range(self.params["max_steps_per_episode"]):
                with tf.GradientTape() as q_tape:
                    logging.debug(f"state:\n{np.asarray(state).reshape([4, 4])}")
                    candidate_actions = list(range(b.action_space.n))
                    canonical_afterstates = [
                        b.get_canonical_board(b.get_afterstate(state, a)[0])
                        for a in candidate_actions
                    ]
                    q_vals = [
                        tf.squeeze(
                            self.q_models[i](
                                np.array(canonical_afterstates[i])[np.newaxis]
                            )
                        )
                        for i in candidate_actions
                    ]
                    logging.debug(f"q_vals : {q_vals}")
                    # pick an action by rolling dice according to the relative
                    # values of the canonical afterstates
                    while True:
                        dice_roll = tfp.distributions.Multinomial(
                            total_count=5, probs=softmax(q_vals)
                        ).sample(1)
                        action_index = np.argmax(dice_roll)
                        action = candidate_actions[action_index]
                        next_state, reward, done, _ = b.step(action)
                        if next_state != state:
                            # you found a valid move
                            break
                        else:
                            # that wasn't a valid move, but one must exist since
                            # we weren't done after the last step
                            logging.debug(
                                f"action {action} was invalid, removing it from "
                                f"the candidates and rolling the dice again"
                            )
                            assert (
                                len(candidate_actions) > 1
                            ), "No actions changed the board but we are not done."
                            a_idx_pp = action_index + 1
                            q_vals = q_vals[:action_index] + q_vals[a_idx_pp:]
                            candidate_actions = (
                                candidate_actions[:action_index]
                                + candidate_actions[a_idx_pp:]
                            )
                    logging.debug(f"action: {action}")
                    logging.debug(
                        f"canonical_afterstate:\n"
                        f"{np.asarray(canonical_afterstates[action]).reshape([4, 4])}"
                    )
                    q_val = q_vals[action_index]
                    logging.debug(f"q_val: {q_val}")
                    logging.debug(f"reward: {reward}")
                    logging.debug(
                        f"next_state:\n{np.asarray(next_state).reshape([4, 4])}"
                    )
                    # update q_model via TD learning using q(s, a) (which we
                    # computed above) and q(s', a')
                    next_candidate_actions = list(range(b.action_space.n))
                    next_canonical_afterstates = [
                        b.get_canonical_board(b.get_afterstate(next_state, action)[0])
                        for action in next_candidate_actions
                    ]
                    next_q_vals = [
                        tf.squeeze(
                            self.q_models[i](
                                np.array(next_canonical_afterstates[i])[np.newaxis]
                            )
                        )
                        for i in next_candidate_actions
                    ]
                    logging.debug(f"next_q_vals: {next_q_vals}")
                    next_action = np.argmax(next_q_vals)
                    logging.debug(f"next_action: {next_action}")
                    next_q_val = next_q_vals[next_action]
                    target_q_val = (
                        reward + (1 - done) * self.params["alpha"] * next_q_val
                    )
                    logging.debug(f"next_q_val: {next_q_val}")
                    logging.debug(f"target_q_val: {target_q_val}")
                    val_loss = tf.math.square(q_val - target_q_val)
                    print(f"loss: {val_loss}")
                val_grads = q_tape.gradient(
                    val_loss, self.q_models[action].trainable_variables
                )
                optimizer.apply_gradients(
                    zip(val_grads, self.q_models[action].trainable_variables)
                )
                train_acc_metric.update_state(action, q_vals)
                print(f"q_val before gradient step: {q_val}")
                print(f"target_q_val: {target_q_val}")
                q_for_print = np.squeeze(
                    self.q_models[action](
                        np.array(canonical_afterstates[action])[np.newaxis]
                    )
                )
                print(f"q_val after gradient step: {q_for_print}")
                print()
                logging.debug("\n")
                # get ready to loop
                state = next_state
                game_score += reward
                if done:
                    break
            print(
                f"accuracy in episode {episode_num}: "
                f"{train_acc_metric.result().numpy()}"
            )
            train_acc_metric.reset_states()
            game_scores.append(game_score)
            game_num_steps.append(step_num + 1)
            avg_game_score = np.mean(game_scores)
            avg_last_10 = np.mean(game_scores[-10:])
            print(
                "%s steps in episode %s, score: %s, running_avg: %.0f, "
                "avg_last_10_games: %.0f"
                % (step_num + 1, episode_num, game_score, avg_game_score, avg_last_10)
            )
            # mlflow.log_metric("game scores", game_score, step=episode_num)
            # mlflow.log_metric("avg game score", avg_game_score, step=episode_num)
            # mlflow.log_metric("avg_score_last_10", avg_last_10)
            # mlflow.log_metric("game num steps", step_num + 1, step=episode_num)
            # mlflow.log_metric(
            #     "avg num steps", np.mean(game_num_steps), step=episode_num
            # )
        return {
            "avg_game_score": avg_game_score,
            "avg_num_steps": np.mean(game_num_steps),
            "episodes_total": episode_num + 1,
            "timesteps_total": np.sum(game_num_steps),
        }
def test_no_randomness():
    game1 = Nick2048(random_seed=13)
    game2 = Nick2048(random_seed=13)
    boards1 = _run_game(game1)
    boards2 = _run_game(game2)
    assert boards1 == boards2
# logging.basicConfig(level=logging.DEBUG)
start_time = time.time()
with mlflow.start_run():
    max_depth = 15
    assert max_depth > 0
    num_random_seeds = 100
    max_max_tile = []
    max_score = []
    total_state_action_pairs = []
    for rand_seed in range(num_random_seeds):
        max_max_tile.append([0] * (max_depth + 1))
        max_score.append([0] * (max_depth + 1))
        total_state_action_pairs.append([0] * (max_depth + 1))
        state_action_scores = {}
        env = Nick2048(random_seed=rand_seed)
        actions = range(env.action_space.n)
        # queue of (depth, game_score, max_tile, state, next_action)
        state_actions = deque()
        init_state = env.get_state()[0]
        # push the initial actions
        for a in actions:
            state_actions.append((1, 0, max(init_state), init_state, a))
        while state_actions:
            debug_str = ""
            t = state_actions.popleft()
            debug_str += f"handling {t}\n"
            depth, game_score, max_tile, state, next_action = t
def test_get_canonical():
    for i in range(100):
        board = _generate_random_board()
        canonical = Nick2048.get_canonical_board(board)
        r90 = Nick2048.rotate_board_right(board)
        r180 = Nick2048.rotate_board_right(r90)
        r270 = Nick2048.rotate_board_right(r180)
        r360 = Nick2048.rotate_board_right(r270)
        xr0 = Nick2048.reflect_board_across_x(board)
        xr90 = Nick2048.rotate_board_right(xr0)
        xr180 = Nick2048.rotate_board_right(xr90)
        xr270 = Nick2048.rotate_board_right(xr180)
        xr360 = Nick2048.rotate_board_right(xr270)
        yr0 = Nick2048.reflect_board_across_y(board)
        yr90 = Nick2048.rotate_board_right(yr0)
        yr180 = Nick2048.rotate_board_right(yr90)
        yr270 = Nick2048.rotate_board_right(yr180)
        yr360 = Nick2048.rotate_board_right(yr270)
        assert canonical == Nick2048.get_canonical_board(r90)
        assert canonical == Nick2048.get_canonical_board(r180)
        assert canonical == Nick2048.get_canonical_board(r270)
        assert canonical == Nick2048.get_canonical_board(r360)
        assert canonical == Nick2048.get_canonical_board(xr0)
        assert canonical == Nick2048.get_canonical_board(xr90)
        assert canonical == Nick2048.get_canonical_board(xr180)
        assert canonical == Nick2048.get_canonical_board(xr270)
        assert canonical == Nick2048.get_canonical_board(xr360)
        assert canonical == Nick2048.get_canonical_board(yr0)
        assert canonical == Nick2048.get_canonical_board(yr90)
        assert canonical == Nick2048.get_canonical_board(yr180)
        assert canonical == Nick2048.get_canonical_board(yr270)
        assert canonical == Nick2048.get_canonical_board(yr360)
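# _generate_random_board is another helper not shown in this section. A
# hypothetical sketch consistent with how test_get_canonical uses it, with the
# tile distribution chosen arbitrarily here, might be:
import random

def _generate_random_board():
    # 16 cells, each either empty or holding a small power of two
    return tuple(random.choice([0, 2, 4, 8, 16, 32, 64, 128]) for _ in range(16))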
def play_with_lookahead():
    game = Nick2048()
    lookahead_fn = get_lookahead_fn(Nick2048, 5)
    run_manual_loop(game, lookahead_fn)
def test_set_board():
    board = tuple([1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16])
    game = Nick2048()
    game.set_board(board)
    assert game.board == board
# requires PYTHONPATH to contain the top-level directory,
# i.e. the improved-funicular checkout from github, or run:
# $ PYTHONPATH=. python tests/perf_benchmarks.py
#
# Alternatively, add a pth file to site-packages, for example:
# $ echo `pwd`/ > improved-funicular/lib/python3.7/site-packages/curr_dir.pth

import time

from envs.nick_2048 import Nick2048
from strategies.random import try_random

board = [2, 0, 8, 16, 2, 4, 8, 4, 2, 0, 2, 2, 4, 0, 0, 0]
game = Nick2048()

start = time.time()
# Initial implementation: .39 sec
# With squash lookup table: .22 sec
for i in range(10000):
    game.set_board(board)
    game.step(game.UP)
end = time.time()
print(f"Time to set board and step: {end - start}")

start = time.time()
rollouts = 100
# Initial (with squash table): .35 sec
try_random(Nick2048, rollouts)
end = time.time()
print(f"Time to run {rollouts} random rollouts: {end - start}")
def test_action_history(action_history, expected_score):
    test_game = Nick2048(random_seed=SEED)
    for action in action_history:
        test_game.step(action)
    assert test_game.done
    assert test_game.score == expected_score