def evaluation(self, leaf_node, random_prob=0.1, host=None):
    """
    evaluation phase
    :param leaf_node: leaf node reached by the selection phase
    :param random_prob: chance of taking the greedy action in the local rollout variant below
    :param host: optional RPC host override
    :return: terminal reward of the simulated rollout
    """
    # simulate the game to the end with the fast rollout policy on the RPC server
    reward = self.rpc.simulate_rpc("policy_rollout", board_to_stream(leaf_node.position.board),
                                   leaf_node.position.get_player_name(), host=host)
    # local rollout variant, kept for reference:
    # game = leaf_node.position.replicate_game()
    # while True:  # loop game
    #     predict_vals = self.rpc.policy_dl_rpc(board_to_stream(game.board), game.get_player_name(), host=host)
    #     # predict_vals = self.rpc.policy_rollout_rpc(board_to_stream(game.board), game.get_player_name())
    #     if random.random() < random_prob:
    #         action = game.choose_action(predict_vals)
    #     else:  # sample an action weighted by the predicted probabilities
    #         action = game.weighted_choose_action(predict_vals)
    #     if action is None:
    #         return 0
    #     _, reward_n, terminal_n = game.step_games(action)
    #     if terminal_n:
    #         return reward_n
    return reward
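# Illustrative only: a minimal sketch of how the three phases in this file might
# compose into one search iteration. `run_simulation` and the reward backup are
# assumptions -- the actual backup step is not shown here. The sketch adds the
# rollout reward to `rollout_rewards` on each edge of the selection track (the
# same counter the expansion threshold checks) and ignores any per-player sign
# handling the real implementation may do.
def run_simulation(self, root, host=None):
    leaf_node, select_track = self.selection(root)
    self.expansion(leaf_node, select_track)
    reward = self.evaluation(leaf_node, host=host)
    # back up along the selected path, from the leaf's parent to the root
    node = leaf_node.parent
    for edge_idx in reversed(select_track):
        node.edges[edge_idx].rollout_rewards += reward
        node = node.parent
    return reward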
def expansion(self, leaf_node, select_track):
    """
    expansion phase
    :param leaf_node: candidate node created by the selection phase
    :param select_track: indices of the edges chosen along the selection path
    :return:
    """
    last_select = select_track[-1]
    last_select_edge = leaf_node.parent.edges[last_select]
    if last_select_edge.rollout_rewards > self.visit_threshold:
        print "**expand one node"
        # append leaf node to search tree
        leaf_node.parent.child[last_select] = leaf_node
        # generate edges for new node
        board_stream = board_to_stream(leaf_node.position.board)
        prior_probs = self.rpc.policy_dl_rpc(board_stream, leaf_node.position.get_player_name())
        prior_probs = normalize_prior_probs(prior_probs)
        leaf_node.generate_edges(prior_probs)
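# `normalize_prior_probs` is called throughout but not defined in this file. A
# minimal sketch under the assumption that it rescales the raw network output
# into a proper probability distribution:
def normalize_prior_probs(probs):
    total = float(sum(probs))
    if total <= 0:  # degenerate network output: fall back to uniform priors
        return [1.0 / len(probs)] * len(probs)
    return [p / total for p in probs]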
def decision(self, action, thread_name):
    thread = self.threads[thread_name]
    thread.set_signal(SIGNAL_PAUSE)
    # re-root the search tree at the child matching the played action
    for idx in xrange(thread.root.child_num()):
        if thread.root.edges[idx].action == action:
            if thread.root.child[idx] is None:
                child_node_position = thread.root.position.replicate_game()
                child_node_position.step_games(thread.root.edges[idx].action)
                thread.root.child[idx] = Node(child_node_position, parent=thread.root)
            thread.root = thread.root.child[idx]
            break
    if thread.root.child_num() == 0:
        prior_probs = self.mcts.rpc.policy_dl_rpc(board_to_stream(thread.root.position.board),
                                                  thread.root.position.get_player_name())
        # normalize prior probs
        prior_probs = normalize_prior_probs(prior_probs)
        thread.root.generate_edges(prior_probs)
    thread.root, action = self.mcts.decision(thread.root)
    thread.set_signal(SIGNAL_RUNNING)
    return action
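# `set_signal`, SIGNAL_PAUSE, and SIGNAL_RUNNING belong to the search-thread
# implementation, which is not part of this file. A sketch of the assumed
# contract: the worker polls its signal between simulations, so `decision`
# above can safely pause the search while it re-roots the tree. Constant
# values and the per-iteration entry point are placeholders.
import threading
import time

SIGNAL_RUNNING, SIGNAL_PAUSE = "running", "pause"  # placeholder values

class SearchThread(threading.Thread):
    def __init__(self, root, mcts):
        threading.Thread.__init__(self)
        self.root, self.mcts = root, mcts
        self.signal = SIGNAL_RUNNING

    def set_signal(self, signal):
        self.signal = signal

    def run(self):
        while True:
            if self.signal == SIGNAL_PAUSE:
                time.sleep(0.01)  # idle until decision() resumes us
                continue
            self.mcts.run_simulation(self.root)  # hypothetical entry point (see sketch above)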
def selection(self, root):
    """
    selection phase
    :param root: root node of the search tree
    :return: (leaf_node, select_track) -- the new leaf and the edge indices chosen along the way
    """
    select_track = []
    # tree traversal
    node, node_parent = root, None
    print "select track: [",
    while node is not None:
        # print node.position.board
        if node.child_num() == 0:  # unexpanded node: fetch priors and generate its edges
            prior_probs = self.rpc.policy_dl_rpc(board_to_stream(node.position.board),
                                                 node.position.get_player_name())
            # normalize prior probs
            prior_probs = normalize_prior_probs(prior_probs)
            node.generate_edges(prior_probs)
        act_q_values = np.empty(node.child_num(), dtype=float)
        for idx, edge in enumerate(node.edges):
            act_q_values[idx] = edge.edge_weight(self.explore_rate)
            # act_q_values[idx] = edge.edge_bonus(self.explore_rate)
        # if node.position.player == RenjuGame.PLAYER_WHITE:  # min for white player
        #     act_q_values = -act_q_values
        best_edge_idx = np.argmax(act_q_values)
        # move to child node
        node_parent = node
        node = node.child[best_edge_idx]
        # store select track
        select_track.append(best_edge_idx)
        print transform_action(node_parent.edges[best_edge_idx].action), ", ",
    print "]"
    # create leaf node
    last_best_edge = node_parent.edges[select_track[-1]]
    leaf_node_position = node_parent.position.replicate_game()
    leaf_node_position.step_games(last_best_edge.action)
    leaf_node = Node(leaf_node_position, parent=node_parent)
    # leaf_node_parent.child[select_track[-1]] = leaf_node
    return leaf_node, select_track
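# `edge_weight` is a method of the edge class, defined elsewhere. A sketch
# assuming the PUCT-style rule from AlphaGo: mean action value plus an
# exploration bonus proportional to the prior probability and inversely
# related to the visit count. `prior`, `visits`, `total_reward`, and
# `parent_visits` are assumed field names, not confirmed by this file.
import math

def edge_weight(self, explore_rate):
    q_value = self.total_reward / self.visits if self.visits > 0 else 0.0
    bonus = explore_rate * self.prior * math.sqrt(self.parent_visits) / (1 + self.visits)
    return q_value + bonus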
def sampling_for_value_network(rpc, sample_num, sample_file, max_time_steps=225):
    """
    sample (position, reward) pairs for value-network training
    :param rpc: model RPC client
    :param sample_num: number of samples to collect
    :param sample_file: pickle file used to checkpoint and resume sampling
    :param max_time_steps: max time steps in games
    :return: list of (game, reward) samples
    """
    sample_games = []
    if os.path.exists(sample_file):
        sample_games = cPickle.load(open(sample_file, 'rb'))
        logger.info("load sample file: %s, samples=%d" % (sample_file, len(sample_games)))
    sample_sets = set()  # used to check unique sample
    game = RenjuGame()
    record_policy_dl_boards = []
    # move step by policy dl
    game.reset_game()
    record_policy_dl_boards.append(game.replicate_game())
    while True:
        action = game.choose_action(rpc.policy_dl_rpc(board_to_stream(game.board), game.get_player_name()))
        if action is None:
            break
        state, _, terminal = game.step_games(action)
        if terminal:
            break
        record_policy_dl_boards.append(game.replicate_game())
    max_time_steps = min(max_time_steps, len(record_policy_dl_boards)) - 1
    # sample game
    while len(sample_games) < sample_num:
        sampled_game = None
        while True:  # loop to find legal sample
            flag_time_step = random.randint(1, max_time_steps)
            recorded_game = record_policy_dl_boards[flag_time_step - 1].replicate_game()
            random_action = recorded_game.random_action()
            if random_action is None:
                break
            random_state, _, terminal = recorded_game.step_games(random_action)
            if not terminal and str(random_state) not in sample_sets:
                sample_sets.add(str(random_state))
                break
        if random_action is None:  # invalid loop
            continue
        # move step by policy rl
        time_step = flag_time_step
        while True:  # simulate game by policy rl
            actions = rpc.policy_rl_rpc(board_to_stream(recorded_game.board), recorded_game.get_player_name())
            action = recorded_game.choose_action(actions)
            if action is None:  # game drawn
                sampled_reward = 0
                break
            state, reward, terminal = recorded_game.step_games(action)
            time_step += 1
            if time_step == (flag_time_step + 1):  # record board
                sampled_game = recorded_game.replicate_game()
            if terminal:  # record value
                sampled_reward = reward
                break
        if sampled_game is not None:
            sample_games.append((sampled_game, sampled_reward))
            logger.info("sample simulate, sample_step=%d, time_step=%d" % (len(sample_games), time_step))
            if len(sample_games) % 100 == 0:
                cPickle.dump(sample_games, open(sample_file, "wb"), protocol=2)
    logger.info("create value network sample, step=%d" % len(sample_games))
    return sample_games
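# Illustrative only: how the (game, reward) samples collected above might be
# converted into training arrays for the value network. `board_to_feature` is
# a hypothetical encoder; the real feature extraction lives with the network code.
def samples_to_arrays(sample_games):
    states = np.array([board_to_feature(game.board) for game, _ in sample_games])
    values = np.array([reward for _, reward in sample_games], dtype=float)
    return states, values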
def train_policy_network(self, rpc, batch_games=128, save_step=50000, max_model_pools=5,
                         init_epsilon=0.5, final_epsilon=0.05, explore=1000000,
                         action_repeat=20, mini_batch_size=128):
    """
    train the RL policy network on self-play data
    :return:
    """
    game = RenjuGame()
    batch_states, batch_actions, batch_rewards = deque(), deque(), deque()
    mini_batch_states = [0] * mini_batch_size
    mini_batch_actions = [0] * mini_batch_size
    mini_batch_rewards = [0] * mini_batch_size
    model_pools = []
    params = self.param_unserierlize(init_params={"global_step": 0, "epsilon": init_epsilon})
    global_step_val, epsilon = params["global_step"], params["epsilon"]
    train_step = 0
    while True:
        start_time = time.time()
        # choose policy network for opponent player from model pools
        if train_step % 10 == 0:
            if len(model_pools) > 0:
                model_file = random.choice(model_pools)
            else:
                model_file = None
            rpc.switch_model("policy_rl", model_file=model_file)
        while len(batch_states) < batch_games:
            # opponent_policy = self.load_history_policy_model(model_file)
            black_opponent = random.choice([True, False])
            # reset game
            game.reset_game()
            # simulate game by current parameter
            states, actions, rewards = [], [], []
            state = game.step_games(None)
            while True:  # loop current game
                # self-play, current model V.S. history model
                if random.random() < epsilon:  # random choose action
                    action = game.random_action()
                else:
                    if (black_opponent and game.player == RenjuGame.PLAYER_BLACK) \
                            or (not black_opponent and game.player == RenjuGame.PLAYER_WHITE):
                        action = game.choose_action(rpc.policy_rl_rpc(board_to_stream(game.board),
                                                                      game.get_player_name()))
                    else:  # current player
                        action = game.choose_action(self.predict([state])[0])
                # step game
                state_n, reward_n, terminal_n = game.step_games(action)
                # store (state, action)
                states.append(state)
                one_hot_act = one_hot_action(action)
                actions.append(one_hot_act)
                # set new states
                state = state_n
                if terminal_n:
                    final_reward = reward_n
                    # logger.info("winner=%s" % ("black" if reward_n > 0 else "white"))
                    break
                # check whether game drawn
                if game.random_action() is None:  # game drawn, equal end, reward=0
                    final_reward = 0
                    logger.info("game drawn, so amazing...")
                    break
            # store (reward), alternating sign between the two players' moves
            for step in xrange(len(states)):
                if step % 2 == 0:
                    rewards.append(final_reward)
                else:
                    rewards.append(-final_reward)
            # store states of ith game
            batch_states.append(states)
            batch_actions.append(actions)
            batch_rewards.append(rewards)
        # fit model by mini batch
        avg_loss, avg_acc = 0.0, 0.0
        for _ in xrange(action_repeat):
            train_step += 1
            for idx in xrange(mini_batch_size):
                game_idx = random.randint(0, len(batch_states) - 1)
                game_time_step_idx = random.randint(0, len(batch_states[game_idx]) - 1)
                mini_batch_states[idx] = batch_states[game_idx][game_time_step_idx]
                mini_batch_actions[idx] = batch_actions[game_idx][game_time_step_idx]
                mini_batch_rewards[idx] = batch_rewards[game_idx][game_time_step_idx]
            _, global_step_val, loss, acc = self.fit(mini_batch_states, mini_batch_actions,
                                                     mini_batch_rewards, fetch_info=True)
            avg_loss += loss
            avg_acc += acc
            # update epsilon
            if epsilon > final_epsilon:
                epsilon -= (init_epsilon - final_epsilon) / explore
        avg_loss /= action_repeat
        avg_acc /= action_repeat
        batch_states.popleft()
        batch_actions.popleft()
        batch_rewards.popleft()
        global_step_val = int(global_step_val)
        elapsed_time = int(time.time() - start_time)
        logger.info("train policy rl network, step=%d, epsilon=%.5f, loss=%.6f, acc=%.6f, time=%d(sec)"
                    % (global_step_val, epsilon, avg_loss, avg_acc, elapsed_time))
        # save model
        if train_step % save_step == 0:
            params["global_step"], params["epsilon"] = global_step_val, epsilon
            self.param_serierlize(params)
            model_file = self.save_model("policy_rl", global_step=global_step_val)
            logger.info("save policy rl model, file=%s" % model_file)
            model_file = model_file[len(self.model_dir):]
            # add history model to pool
            model_pools.append(model_file)
            if len(model_pools) > max_model_pools:  # pop head when model pools exceed
                model_pools.pop(0)
            logger.info("model pools has files: [%s]" % (", ".join(model_pools)))
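# `one_hot_action` is used above but defined elsewhere. A sketch assuming the
# action is a flat move index on a 15x15 renju board (225 slots, matching the
# max_time_steps=225 default in sampling_for_value_network):
def one_hot_action(action, board_size=15):
    one_hot = [0] * (board_size * board_size)
    one_hot[action] = 1
    return one_hot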
def train_rl_network(batch_games=128, save_step=10000, max_model_pools=5,
                     init_epsilon=0.5, final_epsilon=0.01, explore=1000000,
                     action_repeat=32, mini_batch_size=64):
    """
    train the RL policy network (multi-GPU) on self-play data
    :return:
    """
    args = parser_argument().parse_args()
    rpc = ModelRPC(args)
    game = RenjuGame()
    batch_states, batch_actions, batch_rewards = deque(), deque(), deque()
    mini_batch_states = [0] * mini_batch_size
    mini_batch_actions = [0] * mini_batch_size
    mini_batch_rewards = [0] * mini_batch_size
    model_pools = []
    param_file = "%s/param.json" % train_dir
    params = param_unserierlize(param_file, init_params={"global_step": 0, "epsilon": init_epsilon})
    global_step_val, epsilon = params["global_step"], params["epsilon"]
    # load model
    sess, saver, summary_writer, train_op, loss, accuracy, global_step, lr, \
        tower_feeds, tower_logits = network()
    train_step = 0
    while True:
        start_time = time.time()
        # choose policy network for opponent player from model pools
        if train_step % 10 == 0:
            if len(model_pools) > 0:
                model_file = random.choice(model_pools)
            else:
                model_file = None
            rpc.switch_model("policy_rl", model_file=model_file)
        while len(batch_states) < batch_games:
            # opponent_policy = self.load_history_policy_model(model_file)
            black_opponent = random.choice([True, False])
            # reset game
            game.reset_game()
            # simulate game by current parameter
            states, actions, rewards = [], [], []
            state = game.step_games(None)
            while True:  # loop current game
                # self-play, current model V.S. history model
                if (black_opponent and game.player == RenjuGame.PLAYER_BLACK) \
                        or (not black_opponent and game.player == RenjuGame.PLAYER_WHITE):
                    predict_probs = rpc.policy_rl_rpc(board_to_stream(game.board), game.get_player_name())
                else:  # current player
                    predict_probs = sess.run([tower_logits[0]], feed_dict={tower_feeds[0][0]: [state]})[0][0]
                if random.random() < epsilon:  # explore: sample action weighted by the predicted probs
                    action = game.weighted_choose_action(predict_probs)
                else:
                    action = game.choose_action(predict_probs)
                if action is None:
                    final_reward = 0
                    break
                # step game
                state_n, reward_n, terminal_n = game.step_games(action)
                # store (state, action)
                states.append(state)
                actions.append(action)
                # set new states
                state = state_n
                if terminal_n:
                    final_reward = reward_n
                    break
                # check whether game drawn
                if game.random_action() is None:  # game drawn, equal end, reward=0
                    final_reward = 0
                    logger.info("game drawn, so amazing...")
                    break
            # store (reward), alternating sign between the two players' moves
            for step in xrange(len(states)):
                if step % 2 == 0:
                    rewards.append(final_reward)
                else:
                    rewards.append(-final_reward)
            # store states of ith game
            batch_states.append(states)
            batch_actions.append(actions)
            batch_rewards.append(rewards)
        # fit model by mini batch
        avg_loss, avg_acc = 0.0, 0.0
        for _ in xrange(action_repeat / gpu_num):
            train_step += 1
            feeds = {}
            for gpu_id in xrange(gpu_num):
                for idx in xrange(mini_batch_size):
                    game_idx = random.randint(0, len(batch_states) - 1)
                    game_time_step_idx = random.randint(0, len(batch_states[game_idx]) - 1)
                    mini_batch_states[idx] = batch_states[game_idx][game_time_step_idx]
                    mini_batch_actions[idx] = batch_actions[game_idx][game_time_step_idx]
                    mini_batch_rewards[idx] = batch_rewards[game_idx][game_time_step_idx]
                # copy each mini batch: the working lists are reused for every tower,
                # so feeding them directly would give all GPUs the last-filled batch
                feeds[tower_feeds[gpu_id][0]] = list(mini_batch_states)
                feeds[tower_feeds[gpu_id][1]] = list(mini_batch_actions)
                feeds[tower_feeds[gpu_id][2]] = list(mini_batch_rewards)
            _, global_step_val, loss_val, acc_val = sess.run([train_op, global_step, loss, accuracy],
                                                             feed_dict=feeds)
            avg_loss += loss_val
            avg_acc += acc_val
            # update epsilon
            if epsilon > final_epsilon:
                epsilon -= (init_epsilon - final_epsilon) / explore
        avg_loss /= action_repeat
        avg_acc /= action_repeat
        batch_states.popleft()
        batch_actions.popleft()
        batch_rewards.popleft()
        global_step_val = int(global_step_val)
        elapsed_time = int(time.time() - start_time)
        logger.info("train policy rl network, step=%d, epsilon=%.5f, loss=%.6f, acc=%.6f, time=%d(sec)"
                    % (train_step, epsilon, avg_loss, avg_acc, elapsed_time))
        # save model
        if train_step % save_step == 0:
            params["global_step"], params["epsilon"] = global_step_val, epsilon
            param_serierlize(param_file, params)
            model_file = save_model(sess, train_dir, saver, "policy_rl_step_%d" % train_step,
                                    global_step=global_step_val)
            logger.info("save policy rl model, file=%s" % model_file)
            model_file = model_file[len(train_dir):]
            # add history model to pool
            model_pools.append(model_file)
            if len(model_pools) > max_model_pools:  # pop head when model pools exceed
                model_pools.pop(0)
            logger.info("model pools has files: [%s]" % (", ".join(model_pools)))
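# The loss built inside `network()` is not shown in this file. Given the
# self-play setup above (sparse action labels weighted by game outcome), a
# REINFORCE-style policy-gradient loss is the natural fit. A TensorFlow sketch
# under that assumption -- not the confirmed implementation:
import tensorflow as tf

def policy_gradient_loss(logits, actions, rewards):
    # cross-entropy of the played move, scaled by the +1/-1 (or 0) outcome so
    # that moves from losing games are pushed down
    cross_entropy = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=logits, labels=actions)
    return tf.reduce_mean(cross_entropy * rewards)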