import numpy as np

# BoardEnv is the 2048 environment under test, defined elsewhere in this project.


def test_boardenv_move_logic_four_in_a_row():
    # make sure the behavior is correct when a row is full of the same value:
    # both pairs merge in one move, then the two 4s merge on the next.
    init_state = [
        [2.0, 2.0, 2.0, 2.0],
        [0.0, 0.0, 0.0, 0.0],
        [0.0, 0.0, 0.0, 0.0],
        [0.0, 0.0, 0.0, 0.0],
    ]
    b = BoardEnv().from_init_state(init_state)
    assert np.array_equal(init_state, b.state)
    state, reward, done, _ = b.step(BoardEnv.RIGHT)
    assert reward == 8
    assert state[0, 2] == 4 and state[0, 3] == 4, b.state
    state, reward, done, _ = b.step(BoardEnv.RIGHT)
    assert reward >= 8
    assert state[0, 3] == 8, b.state

def test_board_env_step_three():
    init_state = [
        [2.0, 0.0, 0.0, 0.0],
        [0.0, 0.0, 0.0, 0.0],
        [0.0, 2.0, 0.0, 0.0],
        [0.0, 0.0, 0.0, 0.0],
    ]
    b = BoardEnv().from_init_state(init_state)
    state, reward, done, _ = b.step(BoardEnv.RIGHT)
    assert state[0, 3] == 2.0 and state[2, 3] == 2.0, state

def test_boardenv_done_logic():
    # horizontal moves leave this board unchanged (and yield no reward);
    # DOWN merges the two 2s in the second column, after which the board
    # fills and no further moves are possible.
    init_state = [
        [16.0, 8.0, 16.0, 4.0],
        [4.0, 2.0, 4.0, 8.0],
        [32.0, 2.0, 32.0, 4.0],
        [4.0, 16.0, 4.0, 8.0],
    ]
    b = BoardEnv().from_init_state(init_state)
    state, reward, done, _ = b.step(BoardEnv.RIGHT)
    assert not done and np.array_equal(state, np.array(init_state))
    assert reward == 0
    state, reward, done, _ = b.step(BoardEnv.RIGHT)
    assert not done and np.array_equal(state, np.array(init_state))
    assert reward == 0
    state, reward, done, _ = b.step(BoardEnv.LEFT)
    assert not done and np.array_equal(state, np.array(init_state))
    assert reward == 0
    state, reward, done, _ = b.step(BoardEnv.DOWN)
    assert done, state
    assert reward == 4

def test_board_env_step_one():
    # make sure single tiles slide to the right edge without merging.
    init_state = [
        [2.0, 0.0, 0.0, 0.0],
        [0.0, 0.0, 0.0, 0.0],
        [0.0, 2.0, 0.0, 0.0],
        [0.0, 0.0, 0.0, 0.0],
    ]
    b = BoardEnv().from_init_state(init_state)
    state, reward, done, _ = b.step(BoardEnv.RIGHT)
    assert state[0, 3] == 2.0 and state[2, 3] == 2.0

def test_board_env_step_two():
    init_state = [
        [4.0, 2.0, 2.0, 4.0],
        [0.0, 0.0, 0.0, 0.0],
        [0.0, 2.0, 0.0, 0.0],
        [0.0, 0.0, 0.0, 0.0],
    ]
    b = BoardEnv().from_init_state(init_state)
    state, reward, done, _ = b.step(BoardEnv.RIGHT)
    assert state[0, 1] == 4.0
    assert state[0, 2] == 4.0
    assert state[0, 3] == 4.0

def test_boardenv_fill_on_move_logic():
    # make sure a new tile (a 2 or a 4) is spawned after a move: the merged 4
    # plus the spawned tile leave exactly two non-zero cells.
    init_state = [
        [2.0, 2.0, 0.0, 0.0],
        [0.0, 0.0, 0.0, 0.0],
        [0.0, 0.0, 0.0, 0.0],
        [0.0, 0.0, 0.0, 0.0],
    ]
    b = BoardEnv().from_init_state(init_state)
    state, reward, done, _ = b.step(BoardEnv.LEFT)
    num_non_zero_spots = (b.state != 0).sum()
    assert num_non_zero_spots == 2, state

def test_boardenv_move_logic_three_in_a_row():
    # make sure the behavior is correct when three equal tiles line up (here,
    # in a column): only the two closest to the move direction merge.
    init_state = [
        [0.0, 2.0, 0.0, 0.0],
        [0.0, 2.0, 0.0, 0.0],
        [0.0, 2.0, 0.0, 0.0],
        [0.0, 0.0, 0.0, 0.0],
    ]
    b = BoardEnv().from_init_state(init_state)
    assert np.array_equal(init_state, b.state)
    state, reward, done, _ = b.step(BoardEnv.DOWN)
    assert reward == 4
    assert state[3, 1] == 4 and state[2, 1] == 2, b.state

class Andy2048(Base2048):
    info = "Andy's implementation of 2048"

    UP = BoardEnv.UP
    RIGHT = BoardEnv.RIGHT
    DOWN = BoardEnv.DOWN
    LEFT = BoardEnv.LEFT

    @classmethod
    def from_init_state(cls, init_state):
        andy_wrapper = cls()
        if isinstance(init_state, list) and len(init_state) == 16:
            init_state = np.array(init_state).reshape((4, 4))
        andy_wrapper.andy = BoardEnv.from_init_state(init_state)
        return andy_wrapper

    def __init__(self, random_seed=None):
        self.andy = BoardEnv(random_seed=random_seed)

    @property
    def board(self):
        # flatten the 4x4 numpy state into a plain list of 16 ints
        return [int(el) for row in self.andy.state for el in row]

    @property
    def score(self):
        return self.andy.value

    @property
    def action_space(self):
        return self.andy.action_space

    def step(self, direction):
        _, reward, _, c = self.andy.step(direction)
        return self.board, reward, self.andy.done, c

    def get_state(self):
        return self.board, self.score, self.andy.done

    def set_board(self, board):
        self.andy.state = np.array(board[:]).reshape(4, 4)

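# A minimal usage sketch of the Andy2048 wrapper (illustrative, not part of the
# test suite). It assumes action_space is a sequence of the four direction
# constants and that random play eventually fills the board so `done` flips.
if __name__ == "__main__":
    import random

    env = Andy2048(random_seed=0)
    done = False
    while not done:
        # pick a random legal direction; dud moves just leave the board as-is
        direction = random.choice(env.action_space)
        board, reward, done, _ = env.step(direction)
    print("final board:", env.board)
    print("final score:", env.score)
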
# p_model (policy network), q_model (action-value network) and params are
# defined earlier in the script.
q_model.build(input_shape=(1, 16))
optimizer = keras.optimizers.Adam(learning_rate=params["learning_rate"])
p_loss_fn = keras.losses.CategoricalCrossentropy()

b = BoardEnv()
done = False
for episode_num in range(params["num_episodes"]):
    state = b.reset()
    # sample the first action from the policy's output distribution
    action_probs = tf.squeeze(p_model(state[np.newaxis]), axis=0)
    dice_roll = tfp.distributions.Multinomial(total_count=1, probs=action_probs).sample(1)
    action = b.action_space[np.argmax(dice_roll)]
    game_score = 0
    for step_num in range(params["max_steps_per_episode"]):
        # compute s'
        next_state, reward, done, _ = b.step(action)
        if np.array_equal(next_state, state):
            # don't keep trying dud moves
            break
        # compute a' and grad log pi(a'|s'): the cross-entropy of the sampled
        # one-hot action against the policy output is -log pi(a'|s')
        with tf.GradientTape() as p_tape:
            action_probs = tf.squeeze(p_model(next_state[np.newaxis]), axis=0)
            dice_roll = tfp.distributions.Multinomial(
                total_count=1, probs=action_probs).sample(1)
            p_loss = p_loss_fn(dice_roll, action_probs)
        p_grads = p_tape.gradient(p_loss, p_model.trainable_variables)
        next_action = b.action_space[np.argmax(dice_roll)]
        # compute q(s,a), q(s',a') and update q_model
        with tf.GradientTape() as q_tape:
            q_val = tf.squeeze(q_model(state[np.newaxis]))[action]