def evaluation(self, leaf_node, random_prob=0.1, host=None):
    """
    evaluation phase
    :param leaf_node: leaf node reached by the selection phase
    :param random_prob: chance of taking the greedy action in the local rollout variant below
    :param host: optional RPC host override
    :return: terminal reward of the simulated rollout
    """
    # simulate the game to the end with the fast rollout policy on the RPC server
    reward = self.rpc.simulate_rpc("policy_rollout", board_to_stream(leaf_node.position.board),
                                   leaf_node.position.get_player_name(), host=host)
    # local rollout variant, kept for reference:
    # game = leaf_node.position.replicate_game()
    # while True:  # loop game
    #     predict_vals = self.rpc.policy_dl_rpc(board_to_stream(game.board), game.get_player_name(), host=host)
    #     # predict_vals = self.rpc.policy_rollout_rpc(board_to_stream(game.board), game.get_player_name())
    #     if random.random() < random_prob:
    #         action = game.choose_action(predict_vals)
    #     else:  # sample an action weighted by the predicted probabilities
    #         action = game.weighted_choose_action(predict_vals)
    #     if action is None:
    #         return 0
    #     _, reward_n, terminal_n = game.step_games(action)
    #     if terminal_n:
    #         return reward_n
    return reward
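# Illustrative only: a minimal sketch of how the three phases in this file might
# compose into one search iteration. `run_simulation` and the reward backup are
# assumptions -- the actual backup step is not shown here. The sketch adds the
# rollout reward to `rollout_rewards` on each edge of the selection track (the
# same counter the expansion threshold checks) and ignores any per-player sign
# handling the real implementation may do.
def run_simulation(self, root, host=None):
    leaf_node, select_track = self.selection(root)
    self.expansion(leaf_node, select_track)
    reward = self.evaluation(leaf_node, host=host)
    # back up along the selected path, from the leaf's parent to the root
    node = leaf_node.parent
    for edge_idx in reversed(select_track):
        node.edges[edge_idx].rollout_rewards += reward
        node = node.parent
    return reward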
def expansion(self, leaf_node, select_track):
    """
    expansion phase
    :param leaf_node: candidate node created by the selection phase
    :param select_track: indices of the edges chosen along the selection path
    :return:
    """
    last_select = select_track[-1]
    last_select_edge = leaf_node.parent.edges[last_select]
    if last_select_edge.rollout_rewards > self.visit_threshold:
        print "**expand one node"
        # append leaf node to search tree
        leaf_node.parent.child[last_select] = leaf_node
        # generate edges for new node
        board_stream = board_to_stream(leaf_node.position.board)
        prior_probs = self.rpc.policy_dl_rpc(board_stream, leaf_node.position.get_player_name())
        prior_probs = normalize_prior_probs(prior_probs)
        leaf_node.generate_edges(prior_probs)
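# `normalize_prior_probs` is called throughout but not defined in this file. A
# minimal sketch under the assumption that it rescales the raw network output
# into a proper probability distribution:
def normalize_prior_probs(probs):
    total = float(sum(probs))
    if total <= 0:  # degenerate network output: fall back to uniform priors
        return [1.0 / len(probs)] * len(probs)
    return [p / total for p in probs]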
def decision(self, action, thread_name):
    thread = self.threads[thread_name]
    thread.set_signal(SIGNAL_PAUSE)
    # re-root the search tree at the child matching the played action
    for idx in xrange(thread.root.child_num()):
        if thread.root.edges[idx].action == action:
            if thread.root.child[idx] is None:
                child_node_position = thread.root.position.replicate_game()
                child_node_position.step_games(thread.root.edges[idx].action)
                thread.root.child[idx] = Node(child_node_position, parent=thread.root)
            thread.root = thread.root.child[idx]
            break
    if thread.root.child_num() == 0:
        prior_probs = self.mcts.rpc.policy_dl_rpc(board_to_stream(thread.root.position.board),
                                                  thread.root.position.get_player_name())
        # normalize prior probs
        prior_probs = normalize_prior_probs(prior_probs)
        thread.root.generate_edges(prior_probs)
    thread.root, action = self.mcts.decision(thread.root)
    thread.set_signal(SIGNAL_RUNNING)
    return action
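# `set_signal`, SIGNAL_PAUSE, and SIGNAL_RUNNING belong to the search-thread
# implementation, which is not part of this file. A sketch of the assumed
# contract: the worker polls its signal between simulations, so `decision`
# above can safely pause the search while it re-roots the tree. Constant
# values and the per-iteration entry point are placeholders.
import threading
import time

SIGNAL_RUNNING, SIGNAL_PAUSE = "running", "pause"  # placeholder values

class SearchThread(threading.Thread):
    def __init__(self, root, mcts):
        threading.Thread.__init__(self)
        self.root, self.mcts = root, mcts
        self.signal = SIGNAL_RUNNING

    def set_signal(self, signal):
        self.signal = signal

    def run(self):
        while True:
            if self.signal == SIGNAL_PAUSE:
                time.sleep(0.01)  # idle until decision() resumes us
                continue
            self.mcts.run_simulation(self.root)  # hypothetical entry point (see sketch above)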
def selection(self, root):
    """
    selection phase
    :param root: root node of the search tree
    :return: (leaf_node, select_track) -- the new leaf and the edge indices chosen along the way
    """
    select_track = []
    # tree traversal
    node, node_parent = root, None
    print "select track: [",
    while node is not None:
        # print node.position.board
        if node.child_num() == 0:  # unexpanded node: fetch priors and generate its edges
            prior_probs = self.rpc.policy_dl_rpc(board_to_stream(node.position.board),
                                                 node.position.get_player_name())
            # normalize prior probs
            prior_probs = normalize_prior_probs(prior_probs)
            node.generate_edges(prior_probs)
        act_q_values = np.empty(node.child_num(), dtype=float)
        for idx, edge in enumerate(node.edges):
            act_q_values[idx] = edge.edge_weight(self.explore_rate)
            # act_q_values[idx] = edge.edge_bonus(self.explore_rate)
        # if node.position.player == RenjuGame.PLAYER_WHITE:  # min for white player
        #     act_q_values = -act_q_values
        best_edge_idx = np.argmax(act_q_values)
        # move to child node
        node_parent = node
        node = node.child[best_edge_idx]
        # store select track
        select_track.append(best_edge_idx)
        print transform_action(node_parent.edges[best_edge_idx].action), ", ",
    print "]"
    # create leaf node
    last_best_edge = node_parent.edges[select_track[-1]]
    leaf_node_position = node_parent.position.replicate_game()
    leaf_node_position.step_games(last_best_edge.action)
    leaf_node = Node(leaf_node_position, parent=node_parent)
    # leaf_node_parent.child[select_track[-1]] = leaf_node
    return leaf_node, select_track
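# `edge_weight` is a method of the edge class, defined elsewhere. A sketch
# assuming the PUCT-style rule from AlphaGo: mean action value plus an
# exploration bonus proportional to the prior probability and inversely
# related to the visit count. `prior`, `visits`, `total_reward`, and
# `parent_visits` are assumed field names, not confirmed by this file.
import math

def edge_weight(self, explore_rate):
    q_value = self.total_reward / self.visits if self.visits > 0 else 0.0
    bonus = explore_rate * self.prior * math.sqrt(self.parent_visits) / (1 + self.visits)
    return q_value + bonus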
def sampling_for_value_network(rpc, sample_num, sample_file, max_time_steps=225):
    """
    sample (position, reward) pairs for value-network training
    :param rpc: model RPC client
    :param sample_num: number of samples to collect
    :param sample_file: pickle file used to checkpoint and resume sampling
    :param max_time_steps: max time steps in games
    :return: list of (game, reward) samples
    """
    sample_games = []
    if os.path.exists(sample_file):
        sample_games = cPickle.load(open(sample_file, 'rb'))
        logger.info("load sample file: %s, samples=%d" % (sample_file, len(sample_games)))
    sample_sets = set()  # used to check unique sample
    game = RenjuGame()
    record_policy_dl_boards = []
    # move step by policy dl
    game.reset_game()
    record_policy_dl_boards.append(game.replicate_game())
    while True:
        action = game.choose_action(rpc.policy_dl_rpc(board_to_stream(game.board), game.get_player_name()))
        if action is None:
            break
        state, _, terminal = game.step_games(action)
        if terminal:
            break
        record_policy_dl_boards.append(game.replicate_game())
    max_time_steps = min(max_time_steps, len(record_policy_dl_boards)) - 1
    # sample game
    while len(sample_games) < sample_num:
        sampled_game = None
        while True:  # loop to find legal sample
            flag_time_step = random.randint(1, max_time_steps)
            recorded_game = record_policy_dl_boards[flag_time_step - 1].replicate_game()
            random_action = recorded_game.random_action()
            if random_action is None:
                break
            random_state, _, terminal = recorded_game.step_games(random_action)
            if not terminal and str(random_state) not in sample_sets:
                sample_sets.add(str(random_state))
                break
        if random_action is None:  # invalid loop
            continue
        # move step by policy rl
        time_step = flag_time_step
        while True:  # simulate game by policy rl
            actions = rpc.policy_rl_rpc(board_to_stream(recorded_game.board), recorded_game.get_player_name())
            action = recorded_game.choose_action(actions)
            if action is None:  # game drawn
                sampled_reward = 0
                break
            state, reward, terminal = recorded_game.step_games(action)
            time_step += 1
            if time_step == (flag_time_step + 1):  # record board
                sampled_game = recorded_game.replicate_game()
            if terminal:  # record value
                sampled_reward = reward
                break
        if sampled_game is not None:
            sample_games.append((sampled_game, sampled_reward))
            logger.info("sample simulate, sample_step=%d, time_step=%d" % (len(sample_games), time_step))
            if len(sample_games) % 100 == 0:
                cPickle.dump(sample_games, open(sample_file, "wb"), protocol=2)
    logger.info("create value network sample, step=%d" % len(sample_games))
    return sample_games
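# Illustrative only: how the (game, reward) samples collected above might be
# converted into training arrays for the value network. `board_to_feature` is
# a hypothetical encoder; the real feature extraction lives with the network code.
def samples_to_arrays(sample_games):
    states = np.array([board_to_feature(game.board) for game, _ in sample_games])
    values = np.array([reward for _, reward in sample_games], dtype=float)
    return states, values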
def train_policy_network(self, rpc, batch_games=128, save_step=50000, max_model_pools=5,
                         init_epsilon=0.5, final_epsilon=0.05, explore=1000000,
                         action_repeat=20, mini_batch_size=128):
    """
    train the RL policy network on self-play data
    :return:
    """
    game = RenjuGame()
    batch_states, batch_actions, batch_rewards = deque(), deque(), deque()
    mini_batch_states = [0] * mini_batch_size
    mini_batch_actions = [0] * mini_batch_size
    mini_batch_rewards = [0] * mini_batch_size
    model_pools = []
    params = self.param_unserierlize(init_params={"global_step": 0, "epsilon": init_epsilon})
    global_step_val, epsilon = params["global_step"], params["epsilon"]
    train_step = 0
    while True:
        start_time = time.time()
        # choose policy network for opponent player from model pools
        if train_step % 10 == 0:
            if len(model_pools) > 0:
                model_file = random.choice(model_pools)
            else:
                model_file = None
            rpc.switch_model("policy_rl", model_file=model_file)
        while len(batch_states) < batch_games:
            # opponent_policy = self.load_history_policy_model(model_file)
            black_opponent = random.choice([True, False])
            # reset game
            game.reset_game()
            # simulate game by current parameter
            states, actions, rewards = [], [], []
            state = game.step_games(None)
            while True:  # loop current game
                # self-play, current model V.S. history model
                if random.random() < epsilon:  # random choose action
                    action = game.random_action()
                else:
                    if (black_opponent and game.player == RenjuGame.PLAYER_BLACK) \
                            or (not black_opponent and game.player == RenjuGame.PLAYER_WHITE):
                        action = game.choose_action(rpc.policy_rl_rpc(board_to_stream(game.board),
                                                                      game.get_player_name()))
                    else:  # current player
                        action = game.choose_action(self.predict([state])[0])
                # step game
                state_n, reward_n, terminal_n = game.step_games(action)
                # store (state, action)
                states.append(state)
                one_hot_act = one_hot_action(action)
                actions.append(one_hot_act)
                # set new states
                state = state_n
                if terminal_n:
                    final_reward = reward_n
                    # logger.info("winner=%s" % ("black" if reward_n > 0 else "white"))
                    break
                # check whether game drawn
                if game.random_action() is None:  # game drawn, equal end, reward=0
                    final_reward = 0
                    logger.info("game drawn, so amazing...")
                    break
            # store (reward), alternating sign between the two players' moves
            for step in xrange(len(states)):
                if step % 2 == 0:
                    rewards.append(final_reward)
                else:
                    rewards.append(-final_reward)
            # store states of ith game
            batch_states.append(states)
            batch_actions.append(actions)
            batch_rewards.append(rewards)
        # fit model by mini batch
        avg_loss, avg_acc = 0.0, 0.0
        for _ in xrange(action_repeat):
            train_step += 1
            for idx in xrange(mini_batch_size):
                game_idx = random.randint(0, len(batch_states) - 1)
                game_time_step_idx = random.randint(0, len(batch_states[game_idx]) - 1)
                mini_batch_states[idx] = batch_states[game_idx][game_time_step_idx]
                mini_batch_actions[idx] = batch_actions[game_idx][game_time_step_idx]
                mini_batch_rewards[idx] = batch_rewards[game_idx][game_time_step_idx]
            _, global_step_val, loss, acc = self.fit(mini_batch_states, mini_batch_actions,
                                                     mini_batch_rewards, fetch_info=True)
            avg_loss += loss
            avg_acc += acc
            # update epsilon
            if epsilon > final_epsilon:
                epsilon -= (init_epsilon - final_epsilon) / explore
        avg_loss /= action_repeat
        avg_acc /= action_repeat
        batch_states.popleft()
        batch_actions.popleft()
        batch_rewards.popleft()
        global_step_val = int(global_step_val)
        elapsed_time = int(time.time() - start_time)
        logger.info("train policy rl network, step=%d, epsilon=%.5f, loss=%.6f, acc=%.6f, time=%d(sec)"
                    % (global_step_val, epsilon, avg_loss, avg_acc, elapsed_time))
        # save model
        if train_step % save_step == 0:
            params["global_step"], params["epsilon"] = global_step_val, epsilon
            self.param_serierlize(params)
            model_file = self.save_model("policy_rl", global_step=global_step_val)
            logger.info("save policy rl model, file=%s" % model_file)
            model_file = model_file[len(self.model_dir):]
            # add history model to pool
            model_pools.append(model_file)
            if len(model_pools) > max_model_pools:  # pop head when model pools exceed
                model_pools.pop(0)
            logger.info("model pools has files: [%s]" % (", ".join(model_pools)))
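# `one_hot_action` is used above but defined elsewhere. A sketch assuming the
# action is a flat move index on a 15x15 renju board (225 slots, matching the
# max_time_steps=225 default in sampling_for_value_network):
def one_hot_action(action, board_size=15):
    one_hot = [0] * (board_size * board_size)
    one_hot[action] = 1
    return one_hot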
def train_rl_network(batch_games=128, save_step=10000, max_model_pools=5,
                     init_epsilon=0.5, final_epsilon=0.01, explore=1000000,
                     action_repeat=32, mini_batch_size=64):
    """
    train the RL policy network (multi-GPU) on self-play data
    :return:
    """
    args = parser_argument().parse_args()
    rpc = ModelRPC(args)
    game = RenjuGame()
    batch_states, batch_actions, batch_rewards = deque(), deque(), deque()
    mini_batch_states = [0] * mini_batch_size
    mini_batch_actions = [0] * mini_batch_size
    mini_batch_rewards = [0] * mini_batch_size
    model_pools = []
    param_file = "%s/param.json" % train_dir
    params = param_unserierlize(param_file, init_params={"global_step": 0, "epsilon": init_epsilon})
    global_step_val, epsilon = params["global_step"], params["epsilon"]
    # load model
    sess, saver, summary_writer, train_op, loss, accuracy, global_step, lr, \
        tower_feeds, tower_logits = network()
    train_step = 0
    while True:
        start_time = time.time()
        # choose policy network for opponent player from model pools
        if train_step % 10 == 0:
            if len(model_pools) > 0:
                model_file = random.choice(model_pools)
            else:
                model_file = None
            rpc.switch_model("policy_rl", model_file=model_file)
        while len(batch_states) < batch_games:
            # opponent_policy = self.load_history_policy_model(model_file)
            black_opponent = random.choice([True, False])
            # reset game
            game.reset_game()
            # simulate game by current parameter
            states, actions, rewards = [], [], []
            state = game.step_games(None)
            while True:  # loop current game
                # self-play, current model V.S. history model
                if (black_opponent and game.player == RenjuGame.PLAYER_BLACK) \
                        or (not black_opponent and game.player == RenjuGame.PLAYER_WHITE):
                    predict_probs = rpc.policy_rl_rpc(board_to_stream(game.board), game.get_player_name())
                else:  # current player
                    predict_probs = sess.run([tower_logits[0]], feed_dict={tower_feeds[0][0]: [state]})[0][0]
                if random.random() < epsilon:  # explore: sample action weighted by the predicted probs
                    action = game.weighted_choose_action(predict_probs)
                else:
                    action = game.choose_action(predict_probs)
                if action is None:
                    final_reward = 0
                    break
                # step game
                state_n, reward_n, terminal_n = game.step_games(action)
                # store (state, action)
                states.append(state)
                actions.append(action)
                # set new states
                state = state_n
                if terminal_n:
                    final_reward = reward_n
                    break
                # check whether game drawn
                if game.random_action() is None:  # game drawn, equal end, reward=0
                    final_reward = 0
                    logger.info("game drawn, so amazing...")
                    break
            # store (reward), alternating sign between the two players' moves
            for step in xrange(len(states)):
                if step % 2 == 0:
                    rewards.append(final_reward)
                else:
                    rewards.append(-final_reward)
            # store states of ith game
            batch_states.append(states)
            batch_actions.append(actions)
            batch_rewards.append(rewards)
        # fit model by mini batch
        avg_loss, avg_acc = 0.0, 0.0
        for _ in xrange(action_repeat / gpu_num):
            train_step += 1
            feeds = {}
            for gpu_id in xrange(gpu_num):
                for idx in xrange(mini_batch_size):
                    game_idx = random.randint(0, len(batch_states) - 1)
                    game_time_step_idx = random.randint(0, len(batch_states[game_idx]) - 1)
                    mini_batch_states[idx] = batch_states[game_idx][game_time_step_idx]
                    mini_batch_actions[idx] = batch_actions[game_idx][game_time_step_idx]
                    mini_batch_rewards[idx] = batch_rewards[game_idx][game_time_step_idx]
                # copy each mini batch: the working lists are reused for every tower,
                # so feeding them directly would give all GPUs the last-filled batch
                feeds[tower_feeds[gpu_id][0]] = list(mini_batch_states)
                feeds[tower_feeds[gpu_id][1]] = list(mini_batch_actions)
                feeds[tower_feeds[gpu_id][2]] = list(mini_batch_rewards)
            _, global_step_val, loss_val, acc_val = sess.run([train_op, global_step, loss, accuracy],
                                                             feed_dict=feeds)
            avg_loss += loss_val
            avg_acc += acc_val
            # update epsilon
            if epsilon > final_epsilon:
                epsilon -= (init_epsilon - final_epsilon) / explore
        avg_loss /= action_repeat
        avg_acc /= action_repeat
        batch_states.popleft()
        batch_actions.popleft()
        batch_rewards.popleft()
        global_step_val = int(global_step_val)
        elapsed_time = int(time.time() - start_time)
        logger.info("train policy rl network, step=%d, epsilon=%.5f, loss=%.6f, acc=%.6f, time=%d(sec)"
                    % (train_step, epsilon, avg_loss, avg_acc, elapsed_time))
        # save model
        if train_step % save_step == 0:
            params["global_step"], params["epsilon"] = global_step_val, epsilon
            param_serierlize(param_file, params)
            model_file = save_model(sess, train_dir, saver, "policy_rl_step_%d" % train_step,
                                    global_step=global_step_val)
            logger.info("save policy rl model, file=%s" % model_file)
            model_file = model_file[len(train_dir):]
            # add history model to pool
            model_pools.append(model_file)
            if len(model_pools) > max_model_pools:  # pop head when model pools exceed
                model_pools.pop(0)
            logger.info("model pools has files: [%s]" % (", ".join(model_pools)))
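# The loss built inside `network()` is not shown in this file. Given the
# self-play setup above (sparse action labels weighted by game outcome), a
# REINFORCE-style policy-gradient loss is the natural fit. A TensorFlow sketch
# under that assumption -- not the confirmed implementation:
import tensorflow as tf

def policy_gradient_loss(logits, actions, rewards):
    # cross-entropy of the played move, scaled by the +1/-1 (or 0) outcome so
    # that moves from losing games are pushed down
    cross_entropy = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=logits, labels=actions)
    return tf.reduce_mean(cross_entropy * rewards)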