def pazaak():
    # Build a board and deal each player a random four-card side deck.
    b = PazaakBoard()
    player_side_deck = [Card(x) for x in [choice(range(1, 7)) for _ in range(4)]]
    opponent_side_deck = [Card(x) for x in [choice(range(1, 7)) for _ in range(4)]]
    players = [
        PazaakPlayer(player=1, side_deck=player_side_deck),
        PazaakPlayer(player=2, side_deck=opponent_side_deck),
    ]
    state = PazaakState(board=b, players=players, player=players[0])
    # Keep playing while the board reports -1 (game still in progress).
    while b.status(players=state.players) == -1:
        if not state.player.stand:
            state = state.random_card()
        node = Node(state=state)
        tree = Mcts(root=node)
        print(">>>>> CURR PLAYER: <<<<<<<", state.player.player)
        b = tree.find_next_move(100)
        state = PazaakState(board=b, player=state.player, players=state.players,
                            player_index=state.player_index)
        print("TURN\n")
        b.print()
def simulate(self, game: Game, state: State):
    """
    Run all the tree calls in one step: selection (tree_search), expansion,
    random walk and backpropagation, repeated self.m times.

    :param game: Game
    :param state: State
    """
    root_node = Node(None, state, game.current_player)
    for i in range(self.m):
        child = self.tree_search(game, root_node)
        self.expand(game, child.state)
        fin = self.do_random_walk(game, child)
        self.backprop(child, fin.player, 1)
def simulate(self, node: Node):
    curr = node
    board_status = curr.state.board.status(players=self.root.state.players)
    while board_status == constants.IN_PROGRESS:
        curr.state.player = curr.state.toggle_player_new()
        new_state = curr.state.random_play()
        curr = Node(state=new_state)
        board_status = curr.state.board.status(players=curr.state.players)
    return board_status
def get_next_board():
    global app_state
    # A status of -1 means the game is still in progress.
    if app_state.board.status(players=app_state.players) == -1:
        if not app_state.player.stand:
            app_state = app_state.random_card()
        node = Node(state=app_state)
        tree = Mcts(root=node)
        app_state.board = tree.find_next_move(100)
        app_state = PazaakState(board=app_state.board,
                                player=app_state.player,
                                players=app_state.players,
                                player_index=app_state.player_index)
        return app_state.board
    return app_state.board
def tic_tac_toe():
    b = TicTacBoard()
    players = [TicTacPlayer(1), TicTacPlayer(2)]
    state = TicTacState(board=b, player=players[0], players=players)
    while b.status() == -1:
        node = Node(state=state)
        tree = Mcts(root=node)
        print(">>>>> CURR PLAYER: <<<<<<<", state.player.player)
        b = tree.find_next_move(100)
        state = TicTacState(board=b, player=state.player, players=state.players,
                            player_index=state.player_index)
        print("TURN\n")
        b.print()
def expand(self, node):
    not_visited_actions = deepcopy(node.allowed_actions)
    for child in node.children:
        not_visited_actions.remove(child.previous_action)
    # TODO: check if this should be random or chosen by player policy
    chosen_action = random.choice(tuple(not_visited_actions))
    schafkopf_env = gym.make("Schafkopf-v1")
    schafkopf_env.setGameState(deepcopy(node.game_state), deepcopy(node.player_hands))
    schafkopf_env.stepTest(chosen_action)  # state, rewards, terminal, info
    new_node = Node(
        parent=node,
        game_state=deepcopy(schafkopf_env.getGameState()),
        previous_action=chosen_action,
        player_hands=deepcopy(schafkopf_env.getCards()),
        allowed_actions=schafkopf_env.test_game.getOptionsList())
    node.add_child(child_node=new_node)
    return new_node
def gen_child_nodes(self, node: Node):
    p = self.get_next_player(node.player)
    nodes = [Node(node, s, p) for s in self.gen_child_states(node.state)]
    return nodes
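The Node constructors used above differ from project to project, but a minimal MCTS tree node usually carries a parent pointer, a game state, the player to move, a list of children, a visit count, and an accumulated reward. The sketch below is illustrative only; its field and method names are assumptions, not taken from any of the examples.

# Hypothetical minimal tree node for MCTS; names are illustrative only.
class Node:
    def __init__(self, parent=None, state=None, player=None):
        self.parent = parent      # parent node, None for the root
        self.state = state        # game state wrapped by this node
        self.player = player      # player to move in this state
        self.children = []        # expanded child nodes
        self.visits = 0           # number of playouts through this node
        self.reward = 0.0         # accumulated playout reward

    def add_child(self, child_node):
        self.children.append(child_node)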
class MonteCarloTree:
    '''
    Inspired by https://github.com/Taschee/schafkopf/blob/master/schafkopf/players/uct_player.py
    '''

    def __init__(self, game_state, player_hands, allowed_actions, ucb_const=1):
        self.root = Node(None, None, game_state, player_hands, allowed_actions)
        self.ucb_const = ucb_const

    def uct_search(self, num_playouts):
        for _ in range(num_playouts):
            selected_node = self.selection()
            rewards = self.simulation(selected_node)
            self.backup_rewards(leaf_node=selected_node, rewards=rewards)
        results = []
        for child in self.root.children:
            results.append((child.previous_action,
                            child.visits,
                            child.get_average_reward(self.root.game_state["cp"])))
        return results

    def selection(self):
        current_node = self.root
        while not current_node.is_terminal():
            if not current_node.fully_expanded():
                return self.expand(current_node)
            else:
                current_node = current_node.best_child(ucb_const=self.ucb_const)
        return current_node

    def expand(self, node):
        not_visited_actions = deepcopy(node.allowed_actions)
        for child in node.children:
            not_visited_actions.remove(child.previous_action)
        # TODO: check if this should be random or chosen by player policy
        chosen_action = random.choice(tuple(not_visited_actions))
        schafkopf_env = gym.make("Schafkopf-v1")
        schafkopf_env.setGameState(deepcopy(node.game_state), deepcopy(node.player_hands))
        schafkopf_env.stepTest(chosen_action)  # state, rewards, terminal, info
        new_node = Node(
            parent=node,
            game_state=deepcopy(schafkopf_env.getGameState()),
            previous_action=chosen_action,
            player_hands=deepcopy(schafkopf_env.getCards()),
            allowed_actions=schafkopf_env.test_game.getOptionsList())
        node.add_child(child_node=new_node)
        return new_node

    def simulation(self, selected_node):
        schafkopf_env = gym.make("Schafkopf-v1")
        gameOver = deepcopy(selected_node.game_state)["gameOver"]
        schafkopf_env.setGameState(deepcopy(selected_node.game_state),
                                   deepcopy(selected_node.player_hands))
        while not gameOver:
            rewards, round_finished, gameOver = schafkopf_env.test_game.playUntilEnd()
        return rewards["final_rewards"]

    def backup_rewards(self, leaf_node, rewards):
        current_node = leaf_node
        while current_node != self.root:
            current_node.update_rewards(rewards)
            current_node.update_visits()
            current_node = current_node.parent
        self.root.update_visits()

    def get_action_count_rewards(self):
        result = {}
        for child in self.root.children:
            if isinstance(child.previous_action, list):
                result[tuple(child.previous_action)] = (child.visits, child.cumulative_rewards)
            else:
                result[child.previous_action] = (child.visits, child.cumulative_rewards)
        return result
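The best_child method used in selection() is not shown here. UCT-style implementations usually rank children by the UCB1 score sketched below; this is the generic formula, not necessarily the exact weighting used in the schafkopf repository, and the function and parameter names are assumptions for illustration.

import math

# Generic UCB1 score used by UCT-style selection (illustrative sketch).
def ucb1(child_avg_reward, child_visits, parent_visits, ucb_const=1.0):
    if child_visits == 0:
        return float("inf")  # always try unvisited children first
    exploration = ucb_const * math.sqrt(2 * math.log(parent_visits) / child_visits)
    return child_avg_reward + exploration

The parent then descends into the child with the highest score, trading off average reward (exploitation) against the visit-count term (exploration).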
def __init__(self, game_state, player_hands, allowed_actions, ucb_const=1):
    self.root = Node(None, None, game_state, player_hands, allowed_actions)
    self.ucb_const = ucb_const
def simulate(sureWinReg, sureWinPolicy, targetSet, initState, traj, inference):
    max_time_steps = ARG.levels
    simulation_times = ARG.simulation_times
    levels = ARG.levels
    num_sims = ARG.num_sims
    sure_winning_regions = load_sure_winning_regions(sureWinReg)
    sure_winning_policy = load_pre_computed_policy(sureWinPolicy)
    ctrl_env = ControllableEnvironment()
    ad_env = AdversarialEnvironment()
    eva = Evaluation(ctrl_env, ad_env)
    root_state = initState
    # create a tree
    tree = MonteCarloTreeSearch(sure_winning_regions, ctrl_env, ad_env)
    inference_online = inference
    s = root_state
    print("| The initial state is: ", s)
    contr_s = s[0]
    ad_s = s[1]
    joint_traj = traj
    actions = []
    touch = False
    if np.linalg.norm(np.array(contr_s) - np.array(ad_s), ord=1) <= 1:
        print(' ')
        print('| Sorry! The Blue agent is caught!')
        # print("redo the simulate")
    elif contr_s in targetSet:
        print(' ')
        print('| Congrats!!! The blue agent reaches the true goal!!!')
        print('------------------------------------')
    elif contr_s + ad_s in sure_winning_regions:
        current_joint_state = joint_traj[-1]
        if len(current_joint_state) > 1:
            inference_online.update_traj(current_joint_state, False, disp_flag=False)
        # inference_online.update_traj(joint_traj, False, disp_flag=False)
        if touch is False:
            print(' ')
            print('| Congrats!!! The blue agent reaches the sure winning regions!!!')
            print('------------------------------------')
        # contr_a = inference_online.inference_learning.A[sure_winning_policy[(contr_s, ad_s)]]
        contr_a = ctrl_env.A_full[sure_winning_policy[(contr_s, ad_s)]]
    else:
        # inference_online.update_traj(joint_traj, False, disp_flag=False)
        current_joint_state = joint_traj[-1]
        if len(current_joint_state) > 1:
            inference_online.update_traj(current_joint_state, False, disp_flag=False)
        current_node = Node(History([joint_traj], levels, sure_winning_regions, ctrl_env, ad_env))
        best_child = tree.uct_search(num_sims / float(levels), current_node, targetSet)  # create the current root
        contr_a = best_child.history.value[-1]
    contr_n_s = ctrl_env.sto_trans(contr_s, tuple(contr_a))
    ad_a = inference_online.get_ad_action(s)
    ad_n_s = ad_env.sto_trans(ad_s, ad_a)
    contr_s = contr_n_s
    ad_s = ad_n_s
    s = (contr_n_s, ad_n_s)
    print(" ")
    print("| The current state is :", s)
    print("before append:", joint_traj)
    joint_traj.append(s)
    print("after append:", joint_traj)
    actions.append(contr_a)
    print(inference_online.inference_learning.P_h_g)
    return s, joint_traj, inference_online
def load_parameters_mcts(config):
    """
    Load the previous tree, data and info if they already exist, otherwise create new ones.
    Initialise the Scorer and the locks for multithreading.
    :return: None
    """
    with open('mcts/configurations/' + config + ".json") as c:
        config = json.load(c)
    p.config = config
    p.directory = 'data_out/' + config['configuration_name'] + '/'
    p.f_tree = p.directory + "tree.pckl"
    p.f_info = p.directory + "info.json"
    p.f_data = p.directory + "data.json"
    p.f_stat = p.directory + "stat.csv"
    p.r_dft = p.directory + "dft/"
    p.f_stop = p.directory + "stop.txt"
    p.lock_update_data = threading.Lock()
    p.lock_update_node = threading.Lock()
    p.lock_sa_score = threading.Lock()
    if not os.path.isdir(p.directory) or not os.path.isfile(p.f_tree) or not os.path.isfile(p.f_info) \
            or not os.path.isfile(p.f_stat) or not os.path.isfile(p.f_data):
        if not os.path.isdir(p.directory):
            os.mkdir(p.directory)
        if not os.path.isdir(p.r_dft):
            os.mkdir(p.r_dft)
        with open(p.f_stop, 'w') as stop:
            stop.write("")
        with open(p.f_stat, 'w') as stat:
            stat.write("")
        print("New tree")
        p.tree_info = dict()
        p.tree_info[p.info_created] = 0
        p.tree_info[p.info_good] = 0
        p.tree_info[p.info_bad] = 0
        p.tree_info[p.info_alrd_tested] = 0
        p.data = dict()
        if config['from'] == 'root':
            p.tree = Node(SMILES(config['prefix']))
        else:
            p.tree = Node()
        p.turn = 0
    else:
        print("Loading previous tree, data and info")
        with open(p.f_tree, 'rb') as f:
            pickler = pickle.Unpickler(f)
            p.tree = pickler.load()
        while p.tree.parent:
            p.tree = p.tree.parent
        with open(p.f_data, 'r') as f:
            p.data = json.load(f)
        with open(p.f_info, 'r') as f:
            p.tree_info = json.load(f)
        p.turn = sum(1 for line in open(p.f_stat))
        if config['from'] == 'root':
            if p.tree.smiles.element != config['prefix']:
                raise Exception("Root node different from previous execution")
    with open('rnn_models/' + p.config['rnn_repertory'] + "/config.json") as info_rnn:
        p.tokens = json.load(info_rnn)['tokens']
    data_base.load_data_base()
    reset_score_visit(p.tree)
    p.scorer = getattr(
        __import__(p.config['scorer'][0], fromlist=[p.config['scorer'][1]]),
        p.config['scorer'][1])(p.config['alpha_scorer'])
    load_scores()
    p.models = load_model(p.config['rnn_repertory'])
def expand_node(self, node: Node, player: Player):
    # Note: the incoming player argument is immediately replaced by the next player to move.
    player = node.state.toggle_player_new()
    possible_states = node.state.get_all_states(player)
    for state in possible_states:
        node.children.append(Node(state=state, parent=node))
def simulate(sureWinReg, sureWinPolicy, targetSet, initState, traj, inference):
    max_time_steps = ARG.levels
    simulation_times = ARG.simulation_times
    levels = ARG.levels  # this needs to be smaller than the number defined in the state
    num_sims = ARG.num_sims
    # Global variables
    sure_winning_regions = load_sure_winning_regions(sureWinReg)
    sure_winning_policy = load_pre_computed_policy(sureWinPolicy)
    # Create two environments
    ctrl_env = ControllableEnvironment()
    ad_env = AdversarialEnvironment()
    root_state = initState
    s = root_state
    # create a tree
    tree = MonteCarloTreeSearch(sure_winning_regions, ctrl_env, ad_env)
    inference_online = inference
    print(" ")
    print("| This state is: ", s)
    contr_s = s[0]
    ad_s = s[1]
    joint_traj = traj
    actions = []
    inference_online.reset_inference(joint_traj)
    print(' ')
    print("belief:", inference_online.inference_learning.P_h_g)
    touch = False
    if np.linalg.norm(np.array(contr_s) - np.array(ad_s), ord=1) <= 1:
        print(' ')
        print('| Sorry! The Blue agent is caught!')
    elif contr_s in targetSet:
        print(' ')
        print('| Congrats!!! The blue agent reaches the temp goal!!!')
        print('------------------------------------')
    elif contr_s + ad_s in sure_winning_regions:
        if touch is False:
            print(' ')
            print('| Congrats!!! The blue agent reaches the sure winning regions!!!')
            print('------------------------------------')
            touch = True
        contr_a = ctrl_env.A_full[sure_winning_policy[(contr_s, ad_s)]]
    else:
        current_node = Node(History([joint_traj], levels, sure_winning_regions, ctrl_env, ad_env))
        tree.reset_tree()
        best_child = tree.uct_search(num_sims / float(levels), current_node)  # create the current root
        contr_a = best_child.history.value[-1]
    contr_n_s = ctrl_env.sto_trans(contr_s, tuple(contr_a))
    ad_a = inference_online.get_ad_action(s)
    # ad_a = inference_online.get_ad_max_action(s)
    # ad_a = inference_online.get_ad_pursuit_action(s)
    ad_n_s = ad_env.sto_trans(ad_s, ad_a)
    contr_s = contr_n_s
    ad_s = ad_n_s
    s = (contr_n_s, ad_n_s)
    print(" ")
    print("| The current state is :", s)
    joint_traj.append(s)
    inference_online.update_traj(joint_traj, disp_flag=False)
    print(' ')
    print(inference_online.inference_learning.P_h_g)
    # print(inference_online.inference_learning.total_KL)
    actions.append(contr_a)
    return s, joint_traj, inference_online
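Taken together, the drivers above all follow the same four-phase loop: select a leaf with UCT, expand one child, run a random playout, and back-propagate the result. The self-contained toy below sketches that loop on a made-up game (players alternately add 1 or 2 to a running total; whoever reaches exactly 10 wins). Every name in it is illustrative and nothing is taken from the projects above.

import math
import random


class ToyNode:
    def __init__(self, total, player, parent=None, move=None):
        self.total = total          # running sum of the toy game
        self.player = player        # player to move from this state (1 or 2)
        self.parent = parent
        self.move = move            # move that led here (1 or 2)
        self.children = []
        self.visits = 0
        self.wins = 0.0             # wins credited to the player who just moved

    def untried_moves(self):
        tried = {c.move for c in self.children}
        return [m for m in (1, 2) if self.total + m <= 10 and m not in tried]

    def is_terminal(self):
        return self.total >= 10


def uct_select(node, c=1.4):
    # Pick the child with the best UCB1 score from the parent's point of view.
    return max(node.children,
               key=lambda ch: ch.wins / ch.visits
               + c * math.sqrt(math.log(node.visits) / ch.visits))


def rollout(total, player):
    # Random playout to the end of the toy game; returns the winner.
    while total < 10:
        total += random.choice([m for m in (1, 2) if total + m <= 10])
        if total >= 10:
            return player
        player = 3 - player
    return player


def mcts_best_move(root_total, root_player, iterations=200):
    root = ToyNode(root_total, root_player)
    for _ in range(iterations):
        node = root
        # 1. Selection: descend through fully expanded nodes.
        while not node.is_terminal() and not node.untried_moves() and node.children:
            node = uct_select(node)
        # 2. Expansion: add one untried child.
        if not node.is_terminal() and node.untried_moves():
            move = random.choice(node.untried_moves())
            child = ToyNode(node.total + move, 3 - node.player, parent=node, move=move)
            node.children.append(child)
            node = child
        # 3. Rollout: random playout from the new node.
        if node.is_terminal():
            winner = 3 - node.player   # the player who just moved reached 10
        else:
            winner = rollout(node.total, node.player)
        # 4. Backpropagation: credit the win up to the root.
        while node is not None:
            node.visits += 1
            if node.parent is not None and node.parent.player == winner:
                node.wins += 1
            node = node.parent
    # Return the most-visited move at the root.
    return max(root.children, key=lambda ch: ch.visits).move


print(mcts_best_move(0, 1))   # with enough iterations this converges to 1, the winning opening move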