Code Example #1
        def run_and_get_new_weights(init_weights, win0, win1):
            state = GameState(size=19)
            policy = CNNPolicy.load_model(
                os.path.join('test_data', 'minimodel.json'))
            policy.model.set_weights(init_weights)
            optimizer = BatchedReinforcementLearningSGD(lr=0.01, ng=2)
            policy.model.compile(loss=log_loss, optimizer=optimizer)

            # Make moves on the state and get trainable (state, action) pairs from them.
            moves = [(2, 2), (16, 16), (3, 17), (16, 2), (4, 10), (10, 3)]
            state_tensors = []
            action_tensors = []
            for m in moves:
                (st_tensor,
                 mv_tensor) = _make_training_pair(state, m,
                                                  policy.preprocessor)
                state_tensors.append(st_tensor)
                action_tensors.append(mv_tensor)
                state.do_move(m)

            for i, (s, a) in enumerate(zip(state_tensors, action_tensors)):
                # Put even state/action pairs in game 0, odd ones in game 1.
                game_idx = i % 2
                optimizer.set_current_game(game_idx)
                is_last_move = i + 2 >= len(moves)
                if is_last_move:
                    if game_idx == 0:
                        optimizer.set_result(game_idx, win0)
                    else:
                        optimizer.set_result(game_idx, win1)
                # train_on_batch accumulates gradients, and should only cause a change to parameters
                # on the first call after the final set_result() call
                policy.model.train_on_batch(s, a)
            return policy.model.get_weights()
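
The helper _make_training_pair used above (and again in Code Examples #5 and #24) is not part of this excerpt. A minimal sketch of what it presumably does, assuming the preprocessor's state_to_tensor() API shown in Code Example #3 and a one-hot action encoding over the flattened board:

import numpy as np

def _make_training_pair(state, move, preprocessor):
    # Encode the current board position as the network's input tensor.
    state_tensor = preprocessor.state_to_tensor(state)
    # One-hot encode the chosen move over the flattened board
    # (this exact encoding is an assumption, not taken from the source).
    action_tensor = np.zeros((1, state.size * state.size))
    action_tensor[0, move[0] * state.size + move[1]] = 1
    return (state_tensor, action_tensor)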
Code Example #2
File: test_gamestate.py Project: BismarckDD/BetaGo
    def test_positional_superko(self):
        move_list = [(0, 3), (0, 4), (1, 3), (1, 4), (2, 3), (2, 4), (2, 2),
                     (3, 4), (2, 1), (3, 3), (3, 1), (3, 2), (3, 0), (4, 2),
                     (1, 1), (4, 1), (8, 0), (4, 0), (8, 1), (0, 2), (8, 2),
                     (0, 1), (8, 3), (1, 0), (8, 4), (2, 0), (0, 0)]

        #   0 1 2 3 4 5 6 7 8 9
        # 0 . W W B W . . . . .
        # 1 W B . B W . . . . .
        # 2 W B B B W . . . . .
        # 3 B B W W W . . . . .
        # 4 W W W . . . . . . .
        # 5 . . . . . . . . . .
        # 6 . . . . . . . . . .
        # 7 . . . . . . . . . .
        # 8 B B B B B . . . . .
        # 9 . . . . . . . . . .

        gs = GameState(size=9)
        for move in move_list:
            gs.do_move(move)
        self.assertTrue(gs.is_legal((1, 0)))

        gs = GameState(size=9, enforce_superko=True)
        for move in move_list:
            gs.do_move(move)
        self.assertFalse(gs.is_legal((1, 0)))
Code Example #3
File: test_policy.py Project: BismarckDD/BetaGo
    def test_output_size(self):
        policy19 = CNNPolicy(["board", "liberties", "sensibleness", "capture_size"], board=19)
        output = policy19.forward(policy19.preprocessor.state_to_tensor(GameState(19)))
        self.assertEqual(output.shape, (1, 19 * 19))

        policy13 = CNNPolicy(["board", "liberties", "sensibleness", "capture_size"], board=13)
        output = policy13.forward(policy13.preprocessor.state_to_tensor(GameState(13)))
        self.assertEqual(output.shape, (1, 13 * 13))
Code Example #4
File: test_policy.py Project: BismarckDD/BetaGo
 def test_probabilistic_player(self):
     gs = GameState()
     policy = CNNPolicy(["board", "ones", "turns_since"])
     player = ProbabilisticPolicyPlayer(policy)
     for i in range(20):
         move = player.get_move(gs)
         self.assertIsNotNone(move)
         gs.do_move(move)
Code Example #5
    def testApplyAndResetOnGamesFinished(self):
        policy = CNNPolicy.load_model(
            os.path.join('test_data', 'minimodel.json'))
        state = GameState(size=19)
        optimizer = BatchedReinforcementLearningSGD(lr=0.01, ng=2)
        policy.model.compile(loss=log_loss, optimizer=optimizer)

        # Helper to check initial conditions of the optimizer.
        def assertOptimizerInitialConditions():
            for v in optimizer.gradient_sign:
                self.assertEqual(K.eval(v), 0)
            self.assertEqual(K.eval(optimizer.running_games), 2)

        initial_parameters = policy.model.get_weights()

        def assertModelEffect(changed):
            any_change = False
            for cur, init in zip(policy.model.get_weights(),
                                 initial_parameters):
                if not np.allclose(init, cur):
                    any_change = True
                    break
            self.assertEqual(any_change, changed)

        assertOptimizerInitialConditions()

        # Make moves on the state and get trainable (state, action) pairs from them.
        state_tensors = []
        action_tensors = []
        moves = [(2, 2), (16, 16), (3, 17), (16, 2), (4, 10), (10, 3)]
        for m in moves:
            (st_tensor,
             mv_tensor) = _make_training_pair(state, m, policy.preprocessor)
            state_tensors.append(st_tensor)
            action_tensors.append(mv_tensor)
            state.do_move(m)

        for i, (s, a) in enumerate(zip(state_tensors, action_tensors)):
            # Even moves in game 0, odd moves in game 1
            game_idx = i % 2
            optimizer.set_current_game(game_idx)
            is_last_move = i + 2 >= len(moves)
            if is_last_move:
                # Mark game 0 as a win and game 1 as a loss.
                optimizer.set_result(game_idx, game_idx == 0)
            else:
                # Games not finished yet; assert no change to optimizer state.
                assertOptimizerInitialConditions()
            # train_on_batch accumulates gradients, and should only cause a change to parameters
            # on the first call after the final set_result() call
            policy.model.train_on_batch(s, a)
            if i + 1 < len(moves):
                assertModelEffect(changed=False)
            else:
                assertModelEffect(changed=True)
        # Once both games finished, the last call to train_on_batch() should have triggered a reset
        # to the optimizer parameters back to initial conditions.
        assertOptimizerInitialConditions()
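
log_loss, compiled into the model in Code Examples #1 and #5, is likewise not shown in these excerpts. A plausible REINFORCE-style definition, assuming `from keras import backend as K`:

def log_loss(y_true, y_pred):
    # y_true is the one-hot action that was taken; minimizing -log p(action)
    # pushes the policy toward that action (the optimizer's gradient_sign then
    # flips the direction for lost games).
    return -y_true * K.log(K.clip(y_pred, K.epsilon(), 1.0 - K.epsilon()))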
Code Example #6
File: test_gamestate.py Project: BismarckDD/BetaGo
 def test_eye_recursion(self):
     # a checkerboard pattern of black is 'technically' all true eyes
     # mutually supporting each other
     gs = GameState(7)
     for x in range(gs.size):
         for y in range(gs.size):
             if (x + y) % 2 == 1:
                 gs.do_move((x, y), go.BLACK)
     self.assertTrue(gs.is_eye((0, 0), go.BLACK))
Code Example #7
File: test_gamestate.py Project: BismarckDD/BetaGo
 def test_snapback_is_not_ko(self):
     gs = GameState(size=5)
     # B X W B .
     # W W B . .
     # . . . . .
     # . . . . .
     # . . . . .
     # imagine black plays at 'X' capturing the white stone at (2, 0).
     # White may play again at (2, 0) to capture the black stones
     # at (0, 0), (1, 0). this is a 'snapback' not 'ko'
     # since it doesn't return the game to a previous position
     B = [(0, 0), (2, 1), (3, 0)]
     W = [(0, 1), (1, 1), (2, 0)]
     for (b, w) in zip(B, W):
         gs.do_move(b)
         gs.do_move(w)
     # do the capture of the single white stone
     gs.do_move((1, 0))
     # there should be no ko
     self.assertIsNone(gs.ko)
     self.assertTrue(gs.is_legal((2, 0)))
     # now play the snapback
     gs.do_move((2, 0))
     # check that the numbers worked out
     self.assertEqual(gs.num_black_prisoners, 2)
     self.assertEqual(gs.num_white_prisoners, 1)
Code Example #8
File: test_policy.py Project: BismarckDD/BetaGo
 def test_sensible_greedy(self):
     gs = GameState()
     policy = CNNPolicy(["board", "ones", "turns_since"])
     player = GreedyPolicyPlayer(policy)
     empty = (10, 10)
     for x in range(19):
         for y in range(19):
             if (x, y) != empty:
                 gs.do_move((x, y), go.BLACK)
     gs.current_player = go.BLACK
     self.assertIsNone(player.get_move(gs))
Code Example #9
File: test_gamestate.py Project: BismarckDD/BetaGo
    def test_copy_maintains_shared_sets(self):
        gs = GameState(7)
        gs.do_move((4, 4), go.BLACK)
        gs.do_move((4, 5), go.BLACK)

        # assert that gs has *the same object* referenced by group/liberty sets
        self.assertTrue(gs.group_sets[4][5] is gs.group_sets[4][4])
        self.assertTrue(gs.liberty_sets[4][5] is gs.liberty_sets[4][4])

        gs_copy = gs.copy()
        self.assertTrue(gs_copy.group_sets[4][5] is gs_copy.group_sets[4][4])
        self.assertTrue(
            gs_copy.liberty_sets[4][5] is gs_copy.liberty_sets[4][4])
Code Example #10
def parse(boardstr):
    '''Parses a board into a gamestate, and returns the location of any moves
    marked with anything other than 'B', 'X', '#', 'W', 'O', or '.'

    Rows are separated by '|', spaces are ignored.

    '''

    boardstr = boardstr.replace(' ', '')
    board_size = max(boardstr.index('|'), boardstr.count('|'))

    st = GameState(size=board_size)
    moves = {}

    for row, rowstr in enumerate(boardstr.split('|')):
        for col, c in enumerate(rowstr):
            if c == '.':
                continue  # ignore empty spaces
            elif c in 'BX#':
                st.do_move((row, col), color=BLACK)
            elif c in 'WO':
                st.do_move((row, col), color=WHITE)
            else:
                # move reference
                assert c not in moves, "{} already used as a move marker".format(
                    c)
                moves[c] = (row, col)

    return st, moves
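
For example, parsing a 3x3 board where 'a' marks a move of interest:

st, moves = parse(". B W |"
                  ". a . |"
                  ". . . ")
# st now has a black stone at (0, 1) and a white stone at (0, 2),
# and moves == {'a': (1, 1)}.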
Code Example #11
File: test_liberties.py Project: BismarckDD/BetaGo
    def setUp(self):

        #   0 1 2 3 4 5 6 7 8 9 A B
        # 0 . . . . . . . . . . . .
        # 1 . . . . . . . . . . . .
        # 2 . . . . . . . . . . . .
        # 3 . . . . . . . . . . . .
        # 4 . . . . . B B . . . . .
        # 5 . . . . . W B . . . . .
        # 6 . . . . . . B . . . . .
        # 7 . . . . . . . . . . . .
        # 8 . . . . . . . . . . . .
        # 9 . . . . . . . . . . W .
        # A . . . . . . . . . . W W
        # B . . . . . . . . . . . .
        self.s = GameState()
        self.s.do_move((4, 5))
        self.s.do_move((5, 5))
        self.s.do_move((5, 6))
        self.s.do_move((10, 10))
        self.s.do_move((4, 6))
        self.s.do_move((10, 11))
        self.s.do_move((6, 6))
        self.s.do_move((9, 10))
Code Example #12
File: test_gamestate.py Project: BismarckDD/BetaGo
    def test_liberties_after_capture(self):
        # creates 3x3 black group in the middle, that is then all captured
        # ...then an assertion is made that the resulting liberties after
        # capture are the same as if the group had never been there
        gs_capture = GameState(7)
        gs_reference = GameState(7)
        # add in 3x3 black stones
        for x in range(2, 5):
            for y in range(2, 5):
                gs_capture.do_move((x, y), go.BLACK)
        # surround the black group with white stones
        # and set the same white stones in gs_reference
        for x in range(2, 5):
            gs_capture.do_move((x, 1), go.WHITE)
            gs_capture.do_move((x, 5), go.WHITE)
            gs_reference.do_move((x, 1), go.WHITE)
            gs_reference.do_move((x, 5), go.WHITE)
        gs_capture.do_move((1, 1), go.WHITE)
        gs_reference.do_move((1, 1), go.WHITE)
        for y in range(2, 5):
            gs_capture.do_move((1, y), go.WHITE)
            gs_capture.do_move((5, y), go.WHITE)
            gs_reference.do_move((1, y), go.WHITE)
            gs_reference.do_move((5, y), go.WHITE)

        # board configuration and liberties of gs_capture and of gs_reference should be identical
        self.assertTrue(np.all(gs_reference.board == gs_capture.board))
        self.assertTrue(
            np.all(gs_reference.liberty_counts == gs_capture.liberty_counts))
Code Example #13
 def setUp(self):
     self.gs = GameState()
     self.mcts = MCTS(dummy_value, dummy_policy, dummy_rollout, n_playout=2)
Code Example #14
File: test_gamestate.py Project: BismarckDD/BetaGo
    def test_true_eye(self):
        gs = GameState(size=7)
        gs.do_move((1, 0), go.BLACK)
        gs.do_move((0, 1), go.BLACK)

        # false eye at 0, 0
        self.assertTrue(gs.is_eyeish((0, 0), go.BLACK))
        self.assertFalse(gs.is_eye((0, 0), go.BLACK))

        # make it a true eye by turning the corner (1, 1) into an eye itself
        gs.do_move((1, 2), go.BLACK)
        gs.do_move((2, 1), go.BLACK)
        gs.do_move((2, 2), go.BLACK)
        gs.do_move((0, 2), go.BLACK)

        self.assertTrue(gs.is_eyeish((0, 0), go.BLACK))
        self.assertTrue(gs.is_eye((0, 0), go.BLACK))
        self.assertTrue(gs.is_eye((1, 1), go.BLACK))
Code Example #15
File: test_gamestate.py Project: BismarckDD/BetaGo
    def test_standard_ko(self):
        # . B . .
        # B X B .
        # W B W .
        # . W . .
        gs = GameState(size=9)
        gs.do_move((1, 0))  # B
        gs.do_move((2, 0))  # W
        gs.do_move((2, 1))  # B
        gs.do_move((3, 1))  # W
        gs.do_move((1, 2))  # B
        gs.do_move((2, 2))  # W
        gs.do_move((0, 1))  # B

        gs.do_move((1, 1))  # W trigger capture and ko

        self.assertEqual(gs.num_black_prisoners, 1)
        self.assertEqual(gs.num_white_prisoners, 0)

        self.assertFalse(gs.is_legal((2, 1)))

        gs.do_move((5, 5))
        gs.do_move((5, 6))

        self.assertTrue(gs.is_legal((2, 1)))
Code Example #16
File: test_policy.py Project: BismarckDD/BetaGo
 def test_batch_eval_state(self):
     policy = ResnetPolicy(["board", "liberties", "sensibleness", "capture_size"])
     results = policy.batch_eval_state([GameState(), GameState()])
     self.assertEqual(len(results), 2)  # one result per GameState
     self.assertEqual(len(results[0]), 361)  # each one has 361 (move,prob) pairs
Code Example #17
File: test_policy.py Project: BismarckDD/BetaGo
 def test_default_policy(self):
     policy = ResnetPolicy(["board", "liberties", "sensibleness", "capture_size"])
     policy.eval_state(GameState())
Code Example #18
 def setUp(self):
     self.gs = GameState()
     self.node = TreeNode(None, 1.0)
Code Example #19
class TestMCTS(unittest.TestCase):
    def setUp(self):
        self.gs = GameState()
        self.mcts = MCTS(dummy_value, dummy_policy, dummy_rollout, n_playout=2)

    def _count_expansions(self):
        """Helper function to count the number of expansions past the root using the dummy policy
        """
        node = self.mcts._root
        expansions = 0
        # Loop over actions in decreasing probability.
        for action, _ in sorted(dummy_policy(self.gs),
                                key=lambda (a, p): p,
                                reverse=True):
            if action in node._children:
                expansions += 1
                node = node._children[action]
            else:
                break
        return expansions

    def test_playout(self):
        self.mcts._playout(self.gs.copy(), 8)
        # Assert that the most likely child was visited (according to the dummy policy below).
        self.assertEqual(1, self.mcts._root._children[(18, 18)]._n_visits)
        # Assert that the search depth expanded nodes 8 times.
        self.assertEqual(8, self._count_expansions())

    def test_playout_with_pass(self):
        # Test that playout handles the end of the game (i.e. passing/no moves). Mock this by
        # creating a policy that returns nothing after 4 moves.
        def stop_early_policy(state):
            if len(state.history) <= 4:
                return dummy_policy(state)
            else:
                return []

        self.mcts = MCTS(dummy_value,
                         stop_early_policy,
                         stop_early_policy,
                         n_playout=2)
        self.mcts._playout(self.gs.copy(), 8)
        # Assert that (18, 18) and (18, 17) are still only visited once.
        self.assertEqual(1, self.mcts._root._children[(18, 18)]._n_visits)
        # Assert that no expansions happened after reaching the "end" in 4 moves.
        self.assertEqual(5, self._count_expansions())

    def test_get_move(self):
        move = self.mcts.get_move(self.gs)
        self.mcts.update_with_move(move)
        # success if no errors

    def test_update_with_move(self):
        move = self.mcts.get_move(self.gs)
        self.gs.do_move(move)
        self.mcts.update_with_move(move)
        # Assert that the new root still has children.
        self.assertTrue(len(self.mcts._root._children) > 0)
        # Assert that the new root has no parent (the rest of the tree will be garbage collected).
        self.assertIsNone(self.mcts._root._parent)
        # Assert that the next best move according to the root is (18, 17), according to the
        # dummy policy below.
        self.assertEqual((18, 17), self.mcts._root.select()[0])
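
dummy_value, dummy_policy, and dummy_rollout are referenced by these MCTS tests but not included in the excerpt. The assertions only require that dummy_policy rank (18, 18) highest and (18, 17) second, so one plausible sketch (the exact distribution is an assumption) is:

def dummy_policy(state):
    # Give every legal move a probability proportional to its sort order, so
    # (18, 18) is the most likely move and (18, 17) the second most likely.
    moves = sorted(state.get_legal_moves())
    probs = [float(i + 1) for i in range(len(moves))]
    total = sum(probs)
    return list(zip(moves, [p / total for p in probs]))

def dummy_value(state):
    # A constant value estimate is enough for these tests.
    return 0.5

dummy_rollout = dummy_policy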
Code Example #20
File: test_liberties.py Project: BismarckDD/BetaGo
class TestLiberties(unittest.TestCase):
    def setUp(self):

        #   0 1 2 3 4 5 6 7 8 9 A B
        # 0 . . . . . . . . . . . .
        # 1 . . . . . . . . . . . .
        # 2 . . . . . . . . . . . .
        # 3 . . . . . . . . . . . .
        # 4 . . . . . B B . . . . .
        # 5 . . . . . W B . . . . .
        # 6 . . . . . . B . . . . .
        # 7 . . . . . . . . . . . .
        # 8 . . . . . . . . . . . .
        # 9 . . . . . . . . . . W .
        # A . . . . . . . . . . W W
        # B . . . . . . . . . . . .
        self.s = GameState()
        self.s.do_move((4, 5))
        self.s.do_move((5, 5))
        self.s.do_move((5, 6))
        self.s.do_move((10, 10))
        self.s.do_move((4, 6))
        self.s.do_move((10, 11))
        self.s.do_move((6, 6))
        self.s.do_move((9, 10))

    def test_curr_liberties(self):
        self.assertEqual(self.s.liberty_counts[5][5], 2)
        self.assertEqual(self.s.liberty_counts[4][5], 8)
        self.assertEqual(self.s.liberty_counts[5][6], 8)

    def test_neighbors_edge_cases(self):

        st = GameState()
        st.do_move((0, 0))  # B B . . . . .
        st.do_move((5, 5))  # B W . . . . .
        st.do_move((0, 1))  # . . . . . . .
        st.do_move((6, 6))  # . . . . . . .
        st.do_move((1, 0))  # . . . . . W .
        st.do_move((1, 1))  # . . . . . . W

        # get_group in the corner
        self.assertEqual(len(st.get_group((0, 0))), 3, "group size in corner")

        # get_group of an empty space
        self.assertEqual(len(st.get_group((4, 4))), 0,
                         "group size of empty space")

        # get_group of a single piece
        self.assertEqual(len(st.get_group((5, 5))), 1,
                         "group size of single piece")
Code Example #21
File: test_liberties.py Project: BismarckDD/BetaGo
    def test_neighbors_edge_cases(self):

        st = GameState()
        st.do_move((0, 0))  # B B . . . . .
        st.do_move((5, 5))  # B W . . . . .
        st.do_move((0, 1))  # . . . . . . .
        st.do_move((6, 6))  # . . . . . . .
        st.do_move((1, 0))  # . . . . . W .
        st.do_move((1, 1))  # . . . . . . W

        # get_group in the corner
        self.assertEqual(len(st.get_group((0, 0))), 3, "group size in corner")

        # get_group of an empty space
        self.assertEqual(len(st.get_group((4, 4))), 0,
                         "group size of empty space")

        # get_group of a single piece
        self.assertEqual(len(st.get_group((5, 5))), 1,
                         "group size of single piece")
Code Example #22
def play_batch(player_RL, player_SL, batch_size, features):
    """Play a batch of games in parallel and return one training pair
    from each game.
    """

    def do_move(states, moves):
        for st, mv in zip(states, moves):
            if not st.is_end_of_game:
                # Only do more moves if not end of game already
                st.do_move(mv)
        return states

    def do_rand_move(states, player, player_RL):
        """Do a uniform-random move over legal moves and record info for
        training. Only gets called once per game.
        """
        colors = [st.current_player for st in states]  # Record player color
        legal_moves = [st.get_legal_moves() for st in states]
        rand_moves = [lm[np.random.choice(len(lm))] for lm in legal_moves]
        states = do_move(states, rand_moves)
        player = player_RL
        X_list = [st.copy() for st in states]  # For later 1hot preprocessing
        return X_list, colors, states, player

    def convert(X_list, preprocessor):
        """Convert states to 1-hot and concatenate. X's are game state objects.
        """
        states = np.concatenate(
            [preprocessor.state_to_tensor(X) for X in X_list], axis=0)
        return states

    # Lists of game training pairs (1-hot)
    preprocessor = Preprocess(features)
    player = player_SL
    states = [GameState() for i in xrange(batch_size)]
    # Randomly choose turn to play uniform random. Move prior will be from SL
    # policy. Moves after will be from RL policy.
    i_rand_move = np.random.choice(range(450))
    X_list = None
    winners = None
    turn = 0
    while True:
        # Do moves (black)
        if turn == i_rand_move:
            # Make random move, then switch from SL to RL policy
            X_list, colors, states, player = do_rand_move(states, player,
                                                          player_RL)
        else:
            # Get moves (batch)
            moves_black = player.get_moves(states)
            # Do moves (black)
            states = do_move(states, moves_black)
        turn += 1
        # Do moves (white)
        if turn == i_rand_move:
            # Make random move, then switch from SL to RL policy
            X_list, colors, states, player = do_rand_move(states, player,
                                                          player_RL)
        else:
            moves_white = player.get_moves(states)
            states = do_move(states, moves_white)
        turn += 1
        # If all games have ended, we're done. Get winners.
        done = [st.is_end_of_game or st.turns_played > 500 for st in states]
        print turn
        if all(done):
            break
    # Concatenate training examples
    X = None
    if X_list is not None:
        X = convert(X_list, preprocessor)
    winners = np.array([st.get_winner() for st in states]).reshape(batch_size, 1)
    return X, winners
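
A hypothetical call, assuming both players are ProbabilisticPolicyPlayer instances (as in Code Example #4) that expose the batched get_moves() used above:

features = ["board", "ones", "turns_since"]
player_SL = ProbabilisticPolicyPlayer(CNNPolicy(features))
player_RL = ProbabilisticPolicyPlayer(CNNPolicy(features))
X, winners = play_batch(player_RL, player_SL, batch_size=2, features=features)
# X holds the one-hot states at the randomly chosen turn (or None if the games
# ended before that turn); winners holds each game's winning color.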
Code Example #23
File: test_gamestate.py Project: BismarckDD/BetaGo
    def test_simple_eye(self):

        # create a black eye in top left (1, 1), white in bottom right (5, 5)

        gs = GameState(size=7)
        gs.do_move((1, 0))  # B
        gs.do_move((5, 4))  # W
        gs.do_move((2, 1))  # B
        gs.do_move((6, 5))  # W
        gs.do_move((1, 2))  # B
        gs.do_move((5, 6))  # W
        gs.do_move((0, 1))  # B
        gs.do_move((4, 5))  # W

        # test black eye top left
        self.assertTrue(gs.is_eyeish((1, 1), go.BLACK))
        self.assertFalse(gs.is_eyeish((1, 1), go.WHITE))

        # test white eye bottom right
        self.assertTrue(gs.is_eyeish((5, 5), go.WHITE))
        self.assertFalse(gs.is_eyeish((5, 5), go.BLACK))

        # test no eye in other random positions
        self.assertFalse(gs.is_eyeish((1, 0), go.BLACK))
        self.assertFalse(gs.is_eyeish((1, 0), go.WHITE))
        self.assertFalse(gs.is_eyeish((2, 2), go.BLACK))
        self.assertFalse(gs.is_eyeish((2, 2), go.WHITE))
Code Example #24
def run_n_games(optimizer, learner, opponent, num_games):
    '''Run num_games games to completion, calling train_batch() on each position
    the learner sees.

    (Note: optimizer only accumulates gradients in its update function until
    all games have finished)

    '''
    board_size = learner.policy.model.input_shape[-1]
    states = [GameState(size=board_size) for _ in range(num_games)]
    learner_net = learner.policy.model

    # Start all odd games with moves by 'opponent'. Even games will have 'learner' black.
    learner_color = [
        go.BLACK if i % 2 == 0 else go.WHITE for i in range(num_games)
    ]
    odd_states = states[1::2]
    moves = opponent.get_moves(odd_states)
    for st, mv in zip(odd_states, moves):
        st.do_move(mv)

    current = learner
    other = opponent
    # Need to keep track of the index of unfinished states so that we can communicate which one is
    # being updated to the optimizer.
    idxs_to_unfinished_states = {i: states[i] for i in range(num_games)}
    while len(idxs_to_unfinished_states) > 0:
        # Get next moves by current player for all unfinished states.
        moves = current.get_moves(idxs_to_unfinished_states.values())
        just_finished = []
        # Do each move to each state in order.
        for (idx, state), mv in zip(idxs_to_unfinished_states.iteritems(),
                                    moves):
            # Order is important here. We must first get the training pair on the unmodified state.
            # Next, the state is updated and checked to see if the game is over. If it is over, the
            # optimizer is notified via set_result. Finally, train_on_batch is called, which
            # will trigger an update of all parameters only if set_result() has been called
            # for all games already (so set_result must come before train_on_batch).
            is_learnable = current is learner and mv is not go.PASS_MOVE
            if is_learnable:
                (X, y) = _make_training_pair(state, mv,
                                             learner.policy.preprocessor)
            state.do_move(mv)
            if state.is_end_of_game:
                learner_is_winner = state.get_winner() == learner_color[idx]
                optimizer.set_result(idx, learner_is_winner)
                just_finished.append(idx)
            if is_learnable:
                optimizer.set_current_game(idx)
                learner_net.train_on_batch(X, y)

        # Remove games that have finished from dict.
        for idx in just_finished:
            del idxs_to_unfinished_states[idx]

        # Swap 'current' and 'other' for next turn.
        current, other = other, current

    # Return the win ratio.
    wins = sum(state.get_winner() == pc
               for (state, pc) in zip(states, learner_color))
    return float(wins) / num_games
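
A hypothetical driver for run_n_games(), modeled on the other excerpts; the player construction and attribute names here are assumptions, not taken from the source:

num_games = 20
policy = CNNPolicy.load_model(os.path.join('test_data', 'minimodel.json'))
opponent_policy = CNNPolicy.load_model(os.path.join('test_data', 'minimodel.json'))
learner = ProbabilisticPolicyPlayer(policy)
opponent = ProbabilisticPolicyPlayer(opponent_policy)

optimizer = BatchedReinforcementLearningSGD(lr=0.01, ng=num_games)
policy.model.compile(loss=log_loss, optimizer=optimizer)

for i in range(10):  # ten batches of self-play
    win_ratio = run_n_games(optimizer, learner, opponent, num_games)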