def test_board_env_step_three():
    """A RIGHT move slides lone tiles to the rightmost column of their row."""
    init_state = [
        [2.0, 0.0, 0.0, 0.0],
        [0.0, 0.0, 0.0, 0.0],
        [0.0, 2.0, 0.0, 0.0],
        [0.0, 0.0, 0.0, 0.0],
    ]
    # from_init_state is a classmethod (see its other call sites); calling it
    # on the class avoids building a throwaway, randomly-filled instance.
    b = BoardEnv.from_init_state(init_state)
    state, reward, done, _ = b.step(BoardEnv.RIGHT)
    assert state[0, 3] == 2.0 and state[2, 3] == 2.0, state
def test_board_env_step_one():
    """A RIGHT move slides lone tiles to the rightmost column of their row.

    NOTE(review): this duplicates test_board_env_step_three exactly; the old
    comment ("row is full of same values") described a different scenario and
    was wrong for this board, so it has been corrected.
    """
    init_state = [
        [2.0, 0.0, 0.0, 0.0],
        [0.0, 0.0, 0.0, 0.0],
        [0.0, 2.0, 0.0, 0.0],
        [0.0, 0.0, 0.0, 0.0],
    ]
    # from_init_state is a classmethod; avoid constructing a throwaway,
    # randomly-filled BoardEnv just to reach it.
    b = BoardEnv.from_init_state(init_state)
    state, reward, done, _ = b.step(BoardEnv.RIGHT)
    assert state[0, 3] == 2.0 and state[2, 3] == 2.0
def test_board_env_step_two():
    """[4, 2, 2, 4] pushed RIGHT merges only the middle pair: [_, 4, 4, 4]."""
    init_state = [
        [4.0, 2.0, 2.0, 4.0],
        [0.0, 0.0, 0.0, 0.0],
        [0.0, 2.0, 0.0, 0.0],
        [0.0, 0.0, 0.0, 0.0],
    ]
    # from_init_state is a classmethod; avoid constructing a throwaway,
    # randomly-filled BoardEnv just to reach it.
    b = BoardEnv.from_init_state(init_state)
    state, reward, done, _ = b.step(BoardEnv.RIGHT)
    assert state[0, 1] == 4.0
    assert state[0, 2] == 4.0
    assert state[0, 3] == 4.0
def test_boardenv_fill_on_move_logic():
    """After a move, exactly one new tile is spawned on the board.

    Two 2s merge into a single 4, and the env then adds one fresh tile,
    so exactly two spots are non-zero afterwards.
    """
    init_state = [
        [2.0, 2.0, 0.0, 0.0],
        [0.0, 0.0, 0.0, 0.0],
        [0.0, 0.0, 0.0, 0.0],
        [0.0, 0.0, 0.0, 0.0],
    ]
    # from_init_state is a classmethod; avoid constructing a throwaway,
    # randomly-filled BoardEnv just to reach it.
    b = BoardEnv.from_init_state(init_state)
    state, reward, done, _ = b.step(BoardEnv.LEFT)
    num_non_zero_spots = (b.state != 0).sum().sum()
    assert num_non_zero_spots == 2, state
def test_boardenv_move_logic_three_in_a_row():
    """Three equal tiles in a column merge only one pair (nearest the wall)."""
    init_state = [
        [0.0, 2.0, 0.0, 0.0],
        [0.0, 2.0, 0.0, 0.0],
        [0.0, 2.0, 0.0, 0.0],
        [0.0, 0.0, 0.0, 0.0],
    ]
    # from_init_state is a classmethod; avoid constructing a throwaway,
    # randomly-filled BoardEnv just to reach it.
    b = BoardEnv.from_init_state(init_state)
    assert np.array_equal(init_state, b.state)
    state, reward, done, _ = b.step(BoardEnv.DOWN)
    assert reward == 4  # exactly one 2+2 merge
    assert state[3, 1] == 4 and state[2, 1] == 2, b.state
def test_boardenv_move_logic_four_in_a_row():
    """A full row of equal tiles merges pairwise, not into a single tile."""
    init_state = [
        [2.0, 2.0, 2.0, 2.0],
        [0.0, 0.0, 0.0, 0.0],
        [0.0, 0.0, 0.0, 0.0],
        [0.0, 0.0, 0.0, 0.0],
    ]
    # from_init_state is a classmethod; avoid constructing a throwaway,
    # randomly-filled BoardEnv just to reach it.
    b = BoardEnv.from_init_state(init_state)
    assert np.array_equal(init_state, b.state)
    state, reward, done, _ = b.step(BoardEnv.RIGHT)
    assert reward == 8  # two separate 2+2 merges
    assert state[0, 2] == 4 and state[0, 3] == 4, b.state
    # The resulting pair of 4s merges on the next RIGHT move.
    state, reward, done, _ = b.step(BoardEnv.RIGHT)
    assert reward >= 8  # >= because the randomly spawned tile may merge too
    assert state[0, 3] == 8, b.state
def test_boardenv_init():
    """A fresh BoardEnv starts with exactly `init_spots_filled` tiles."""
    width = random.randint(4, 10)
    spots_to_fill = random.randint(0, 4)
    env = BoardEnv(width=width, init_spots_filled=spots_to_fill)
    occupied = (env.state != 0).sum().sum()
    assert occupied == spots_to_fill, (
        "BoardEnv initializing wrong num spots %s" % occupied)
class Andy2048(Base2048):
    """Adapter exposing a BoardEnv game through the Base2048 interface.

    All game logic is delegated to the wrapped ``BoardEnv`` held in
    ``self.andy``; this class only translates between the flat 16-int
    board representation and BoardEnv's 4x4 state array.
    """

    info = "Andy's implementation of 2048"
    UP = BoardEnv.UP
    RIGHT = BoardEnv.RIGHT
    DOWN = BoardEnv.DOWN
    LEFT = BoardEnv.LEFT

    @classmethod
    def from_init_state(cls, init_state):
        """Alternate constructor starting the wrapped env at ``init_state``.

        A flat list of 16 values is reshaped into the 4x4 grid first.
        """
        wrapper = cls()
        if isinstance(init_state, list) and len(init_state) == 16:
            init_state = np.array(init_state).reshape((4, 4))
        wrapper.andy = BoardEnv.from_init_state(init_state)
        return wrapper

    def __init__(self, random_seed=None):
        self.andy = BoardEnv(random_seed=random_seed)

    @property
    def board(self):
        # Flatten the 4x4 state row-major into a list of plain ints.
        return [int(cell) for row in self.andy.state for cell in row]

    @property
    def score(self):
        return self.andy.value

    @property
    def action_space(self):
        return self.andy.action_space

    def step(self, direction):
        """Advance the game one move; returns (board, reward, done, info)."""
        _, reward, _, info = self.andy.step(direction)
        return self.board, reward, self.andy.done, info

    def get_state(self):
        return self.board, self.score, self.andy.done

    def set_board(self, board):
        # Copy before reshaping so the caller's list is never aliased.
        self.andy.state = np.array(board[:]).reshape(4, 4)
def test_boardenv_done_logic():
    """The game ends only when a move fills the board with no merges left.

    The starting board is full with no horizontal merges, so RIGHT/LEFT are
    no-ops (reward 0, not done, state unchanged). Column 1 holds two
    adjacent 2s, so DOWN merges them for reward 4 and finishes the game.
    """
    init_state = [
        [16.0, 8.0, 16.0, 4.0],
        [4.0, 2.0, 4.0, 8.0],
        [32.0, 2.0, 32.0, 4.0],
        [4.0, 16.0, 4.0, 8.0],
    ]
    # from_init_state is a classmethod; avoid constructing a throwaway,
    # randomly-filled BoardEnv just to reach it.
    b = BoardEnv.from_init_state(init_state)
    state, reward, done, _ = b.step(BoardEnv.RIGHT)
    assert not done and np.array_equal(state, np.array(init_state))
    assert reward == 0
    state, reward, done, _ = b.step(BoardEnv.RIGHT)
    assert not done and np.array_equal(state, np.array(init_state))
    assert reward == 0
    state, reward, done, _ = b.step(BoardEnv.LEFT)
    assert not done and np.array_equal(state, np.array(init_state))
    assert reward == 0
    state, reward, done, _ = b.step(BoardEnv.DOWN)
    assert done, state
    assert reward == 4
def __init__(self, random_seed=None):
    # Delegate all game logic to a fresh BoardEnv; random_seed is passed
    # straight through (presumably seeding the env's tile RNG — confirm
    # against BoardEnv's constructor).
    self.andy = BoardEnv(random_seed=random_seed)
def from_init_state(cls, init_state):
    """Alternate constructor: wrap a BoardEnv started from *init_state*.

    A flat list of exactly 16 values is reshaped into the 4x4 grid;
    anything else is handed to BoardEnv.from_init_state unchanged.
    """
    andy_wrapper = cls()
    if isinstance(init_state, list) and len(init_state) == 16:
        init_state = np.array(init_state).reshape((4, 4))
    andy_wrapper.andy = BoardEnv.from_init_state(init_state)
    return andy_wrapper
mlflow.log_params(params)
# Policy network: probability distribution over the four moves.
p_model = keras.Sequential([
    keras.layers.Flatten(),
    keras.layers.Dense(10, activation="relu"),
    keras.layers.Dense(4, activation="softmax"),
])
# Q network: raw per-action values (no softmax on the output layer).
q_model = keras.Sequential([
    keras.layers.Flatten(),
    keras.layers.Dense(10, activation="relu"),
    keras.layers.Dense(4),
])
# Build with a flattened 4x4 board (16 inputs, batch of 1).
q_model.build(input_shape=(1, 16))
# NOTE(review): `lr` is the legacy Keras alias for `learning_rate` and is
# removed in newer Keras releases — confirm the pinned version accepts it.
optimizer = keras.optimizers.Adam(lr=params["learning_rate"])
p_loss_fn = keras.losses.CategoricalCrossentropy()
b = BoardEnv()
done = False
for episode_num in range(params["num_episodes"]):
    state = b.reset()
    # Sample the first action from the policy's move distribution.
    action_probs = tf.squeeze(p_model(state[np.newaxis]), axis=0)
    dice_roll = tfp.distributions.Multinomial(total_count=1, probs=action_probs).sample(1)
    action = b.action_space[np.argmax(dice_roll)]
    game_score = 0
    for step_num in range(params["max_steps_per_episode"]):
        # compute s'
        next_state, reward, done, _ = b.step(action)
        if np.array_equal(next_state, state):
            # don't keep trying dud moves
            break
        # compute a' and grad log pi(a'|s')
def test_boardenv_from_init_state():
    """from_init_state infers width and filled-spot count from the grid."""
    env = BoardEnv.from_init_state([[0, 0], [2, 0]])
    assert env.value == 0.0
    assert np.sum(env.state) == 2
    assert env.width == 2
    assert env.init_spots_filled == 1