def _expansion_simulation(self, leaf_id, win_index):
    """Expand a non-terminal leaf node and run one random rollout from it.

    Returns the rollout reward for an ordinary leaf, 0. for the root node
    (the root is expanded but never simulated), and 1. for a terminal leaf
    (terminal nodes are not expanded).
    """
    node = self.tree[leaf_id]
    leaf_board = node['board']
    leaf_player = node['player']

    if win_index != 0:
        # Terminal node: no expansion, fixed reward.
        return 1.

    # --- expansion: register a child entry for every legal move ---
    for move in utils.valid_actions(leaf_board):
        move_index = move[1]
        child_id = leaf_id + (move_index, )
        self.tree[child_id] = {
            'board': utils.get_board(child_id, self.board_size),
            'player': utils.get_turn(child_id),
            'parent': leaf_id,
            'child': [],
            'n': 0.,
            'w': 0.,
            'q': 0.
        }
        node['child'].append(move_index)

    if not node['parent']:
        # Root node: skip the rollout.
        return 0.

    # --- simulation: play uniformly random moves until the game ends ---
    board = leaf_board.copy()
    turn = leaf_player
    while True:
        moves = utils.valid_actions(board)
        picked = moves[np.random.choice(len(moves))]
        board[picked[0]] = 1 if turn == 0 else -1
        outcome = utils.check_win(board, self.win_mark)
        if outcome != 0:
            return utils.get_reward(outcome, leaf_id)
        turn = abs(turn - 1)  # flip player exactly as the original does
def get_pi(self, root_id, board, turn, tau):
    """Return a uniform policy vector over the currently legal moves.

    `turn` and `tau` are accepted for interface compatibility with other
    agents but are not used here: every valid action receives equal
    probability.

    Returns a flat float array of length board_size**2 whose entries at
    legal-move indices sum to 1; all zeros when the board has no legal
    moves (fixes a ZeroDivisionError on a full board).
    """
    self.root_id = root_id
    pi = np.zeros(self.board_size**2, 'float')
    actions = utils.valid_actions(board)
    if not actions:
        # Full board: no legal moves, avoid dividing by zero.
        return pi
    prob = 1 / len(actions)
    for _, idx in actions:  # coordinate part of the action is unused
        pi[idx] = prob
    return pi
def simulation(self, tree, child_id):
    """Play uniformly random moves from `child_id`'s state to the end.

    Returns the win index reported by check_win (nonzero = game over).
    """
    board = deepcopy(tree[child_id]['state'])
    turn = deepcopy(tree[child_id]['player'])
    while True:
        result = check_win(board, self.win_mark)
        if result != 0:
            return result
        move = random.choice(valid_actions(board))
        if turn == 0:
            turn = 1
            board[move[0]] = 1
        else:
            turn = 0
            board[move[0]] = -1
def expansion(self, tree, leaf_id, expand_thres=10):
    """Expand `leaf_id` with one child per legal action, when eligible.

    A leaf is expanded when it is the root `(0, )` or its visit count
    exceeds `expand_thres` (generalized from a hard-coded 10 into a
    backward-compatible keyword parameter), and it is not terminal.

    Returns (tree, child_id): a randomly chosen new child when expansion
    happened, otherwise the unchanged tree and the leaf itself.
    """
    leaf_state = deepcopy(tree[leaf_id]['state'])
    is_terminal = check_win(leaf_state, self.win_mark)
    actions = valid_actions(leaf_state)
    is_expand = leaf_id == (0, ) or tree[leaf_id]['n'] > expand_thres

    if is_terminal != 0 or not is_expand:
        # Terminal leaf, or not visited often enough yet: return as-is.
        return tree, leaf_id

    # Expansion for every possible action.
    childs = []
    for action in actions:
        state = deepcopy(tree[leaf_id]['state'])
        action_index = action[1]
        current_player = tree[leaf_id]['player']
        if current_player == 0:
            next_turn = 1
            state[action[0]] = 1
        else:
            next_turn = 0
            state[action[0]] = -1
        child_id = leaf_id + (action_index, )
        childs.append(child_id)
        tree[child_id] = {
            'state': state,
            'player': next_turn,
            'child': [],
            'parent': leaf_id,
            'n': 0,
            'w': 0,
            'q': 0
        }
        tree[leaf_id]['child'].append(action_index)

    # Pick one of the newly created children uniformly at random
    # (random.choice is the idiomatic form of random.sample(x, 1)[0]).
    return tree, random.choice(childs)
h.add_only_observation(observation) print('Action from POCMP: ', action, 'Real observation: ', observation) #Save the 'old' particle list to update afterwards old_particle_list = copy.deepcopy( pomcp.tree.nodes[pomcp.tree.root_key].particle_list) #print('tamanho old list: ', len(old_particle_list)) pomcp.tree.prune_and_make_new_root(action, observation) #print('Historico oficial') #h.print_history() state_from_history, _ = simulator.get_dummy_state_and_legal_actions_given_history( h) #Now update the belief state pomcp.tree.nodes[pomcp.tree.root_key].particle_list = particle_list_update( simulator, old_particle_list, int(pomcp.n_simulations), state_from_history, action, observation, 100) if len(pomcp.tree.nodes[pomcp.tree.root_key].particle_list) == 0: break print('Out of particles, finishing episode with SelectRandom') time = 0 while time < 100: action = choice(valid_actions(real_initial_state)) successor_state, observation, reward, is_terminal = simulator.step( real_initial_state, action) if is_terminal: print('Finished') break h.add(action, observation) print('Historico oficial') h.print_history()