def act(self, gs: GameState) -> int:
    gs_unique_id = gs.get_unique_id()
    available_actions = gs.get_available_actions(gs.get_active_player())
    state_vec = gs.get_vectorized_state()

    # Mask so the actor only puts probability mass on the legal actions.
    mask_vec = np.zeros((self.action_space_size,))
    mask_vec[available_actions] = 1.0

    v = self.critic.predict(state_vec)
    p = self.actor.predict(state_vec, mask_vec)
    p = np.array(p)
    p /= p.sum()

    # Sample an action from the (masked, renormalized) policy.
    indexes = np.arange(self.action_space_size)
    chosen_action = np.random.choice(indexes, p=p)
    # Alternative: sample only among the legal actions.
    # valid_actions_probability = p[available_actions]
    # valid_actions_probability_sum = np.sum(valid_actions_probability)
    # normalized_valid_action_probability = valid_actions_probability / valid_actions_probability_sum
    # chosen_action = np.random.choice(available_actions, p=normalized_valid_action_probability)

    # Store the pieces of the trajectory needed for the on-policy update.
    self.v.append(v)
    self.s.append(state_vec)
    self.m.append(mask_vec)
    self.a.append(to_categorical(chosen_action, self.action_space_size))
    if not self.is_last_episode_terminal:
        self.r.append(self.r_temp)
    self.r_temp = 0.0
    self.is_last_episode_terminal = False

    return chosen_action
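# The buffers filled by act() above (self.s, self.m, self.a, self.v, self.r) are
# presumably consumed at the end of each episode to update the actor and critic.
# A minimal sketch of that step, assuming a standard advantage actor-critic
# update; the helper name compute_returns_and_advantages and the gamma default
# are illustrative assumptions, not part of the original code.
def compute_returns_and_advantages(rewards, values, gamma=0.99):
    # Discounted returns G_t computed backwards, then advantages A_t = G_t - V(s_t).
    returns = np.zeros(len(rewards))
    g = 0.0
    for t in reversed(range(len(rewards))):
        g = rewards[t] + gamma * g
        returns[t] = g
    advantages = returns - np.array(values).flatten()
    return returns, advantages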
def act(self, gs: GameState) -> int:
    gs_unique_id = gs.get_unique_id()
    available_actions = gs.get_available_actions(gs.get_active_player())

    # Lazily create Q-values for unseen states, initialized to small random values.
    if gs_unique_id not in self.Q:
        self.Q[gs_unique_id] = dict()
        for a in available_actions:
            self.Q[gs_unique_id][a] = (np.random.random() * 2.0 - 1.0) / 10.0

    # Epsilon-greedy action selection.
    if np.random.random() <= self.epsilon:
        chosen_action = np.random.choice(available_actions)
    else:
        chosen_action = max(self.Q[gs_unique_id], key=self.Q[gs_unique_id].get)

    # Tabular Q-learning update for the previous transition (s, a, r, s').
    if self.s is not None:
        self.Q[self.s][self.a] += self.alpha * (
            self.r + self.gamma * max(self.Q[gs_unique_id].values())
            - self.Q[self.s][self.a])

    self.s = gs_unique_id
    self.a = chosen_action
    self.r = 0.0
    return self.a
def run_to_the_end(agents: List[Agent], gs: GameState, render: bool = False):
    while not gs.is_game_over():
        if render:
            gs.render()
        run_step(agents, gs)
    if render:
        gs.render()
def act(self, gs: GameState) -> int:
    available_actions = gs.get_available_actions(gs.get_active_player())
    # Block until the human clicks on a cell that maps to a legal action.
    while True:
        for event in pygame.event.get():
            if event.type == pygame.MOUSEBUTTONDOWN:
                valid_action = gs.get_valid_action_from_mouse_pos(
                    pygame.mouse.get_pos()[0], pygame.mouse.get_pos()[1])
                if valid_action in available_actions:
                    return valid_action
                else:
                    print("Incorrect action")
def act(self, gs: GameState) -> int:
    available_actions = gs.get_available_actions(gs.get_active_player())
    print(f"Choose an action index from: {available_actions}")
    # Keep prompting until the human enters one of the legal action indices.
    while True:
        try:
            action_candidate = int(input())
            if action_candidate in available_actions:
                break
        except Exception:
            pass
        print("Action not valid, please try again!")
    return action_candidate
def run_for_n_games_and_return_max(agents: List[Agent], gs: GameState,
                                   games_count: int, render: bool = False) -> np.ndarray:
    # Track, for each player, the best score observed over all games.
    old_and_new_scores = np.ones((2, len(gs.get_scores()))) * np.NINF
    for _ in range(games_count):
        gs_copy = gs.clone()
        run_to_the_end(agents, gs_copy, render=render)
        new_scores = gs_copy.get_scores()
        old_and_new_scores[1, :] = new_scores
        old_and_new_scores[0, :] = np.max(old_and_new_scores, axis=0)
    return old_and_new_scores[0, :]
def run_step(agents: List[Agent], gs: GameState):
    assert not gs.is_game_over()
    active_player_index = gs.get_active_player()
    old_scores = gs.get_scores().copy()
    action = agents[active_player_index].act(gs)
    gs.step(active_player_index, action)
    new_scores = gs.get_scores()
    rewards = new_scores - old_scores
    # Every agent observes its own reward, including those that did not act this step.
    for i, agent in enumerate(agents):
        agent.observe(rewards[i], gs.is_game_over(), i)
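# run_step() above only requires agents to expose act() and observe(). A minimal
# sketch of that contract, assuming this interface; AgentInterfaceSketch is a
# hypothetical stand-in for the project's actual Agent base class.
class AgentInterfaceSketch:
    def act(self, gs: GameState) -> int:
        # Return the index of the action to play in the current state.
        raise NotImplementedError

    def observe(self, reward: float, terminal: bool, player_index: int) -> None:
        # Receive the reward earned since this agent's last decision.
        # Non-learning agents can simply ignore this callback.
        pass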
def act(self, gs: GameState) -> int:
    available_actions = gs.get_available_actions(gs.get_active_player())
    state_vec = gs.get_vectorized_state(
        mode="2D" if self.using_convolution else None)
    predicted_Q_values = self.Q.predict(state_vec)

    # Epsilon-greedy selection restricted to the legal actions.
    if np.random.random() <= self.epsilon:
        chosen_action = np.random.choice(available_actions)
    else:
        chosen_action = available_actions[int(
            np.argmax(predicted_Q_values[available_actions]))]

    # One-step Q-learning target for the previous transition.
    if self.s is not None:
        target = self.r + self.gamma * max(predicted_Q_values[available_actions])
        self.Q.train(self.s, self.a, target)

    self.s = state_vec
    self.a = to_categorical(chosen_action, self.action_space_size)
    self.r = 0.0
    return chosen_action
def act(self, gs: GameState) -> int:
    gs_unique_id = gs.get_unique_id()
    available_actions = gs.get_available_actions(gs.get_active_player())
    # The state is fed to the network as a one-hot encoding of its unique id.
    state_vec = to_categorical(gs_unique_id, gs.get_max_state_count())
    predicted_Q_values = self.Q.predict(state_vec)

    # Epsilon-greedy selection restricted to the legal actions.
    if np.random.random() <= self.epsilon:
        chosen_action = np.random.choice(available_actions)
    else:
        chosen_action = available_actions[int(
            np.argmax(predicted_Q_values[available_actions]))]

    # One-step Q-learning target for the previous transition.
    if self.s is not None:
        target = self.r + self.gamma * max(predicted_Q_values[available_actions])
        self.Q.train(self.s, self.a, target)

    self.s = state_vec
    self.a = to_categorical(chosen_action, self.action_space_size)
    self.r = 0.0
    return chosen_action
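# Both deep Q agents above rely on a wrapper self.Q exposing predict() and
# train(state, action_one_hot, target). A minimal Keras-based sketch of such a
# wrapper, assuming a one-step squared-error update on the chosen action only;
# the class name, layer sizes and optimizer are illustrative assumptions.
class QNetworkWrapperSketch:
    def __init__(self, state_size: int, action_space_size: int):
        import tensorflow as tf
        self.model = tf.keras.Sequential([
            tf.keras.layers.Dense(64, activation="relu", input_shape=(state_size,)),
            tf.keras.layers.Dense(action_space_size, activation="linear"),
        ])
        self.model.compile(optimizer="adam", loss="mse")

    def predict(self, state_vec):
        return self.model.predict(np.array([state_vec]), verbose=0)[0]

    def train(self, state_vec, action_one_hot, target):
        # Move only the chosen action's Q-value toward the scalar target.
        q = self.predict(state_vec)
        q[int(np.argmax(action_one_hot))] = target
        self.model.fit(np.array([state_vec]), np.array([q]), verbose=0)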
def run_for_n_games_and_return_stats(
    agents: List[Agent],
    gs: GameState,
    games_count: int,
    shuffle_players: bool = False,
    render: bool = False,
    writer=None,
    show_progress: bool = False,
    progress_desc: str = None,
) -> (np.ndarray, np.ndarray, float):
    total_scores = np.zeros_like(gs.get_scores())
    total_times = 0
    agents_order = np.arange(len(agents))
    agents_copy = agents
    if shuffle_players:
        agents_copy = agents.copy()

    iterable = (tqdm.tqdm(range(games_count), progress_desc)
                if show_progress else range(games_count))
    for game in iterable:
        game = game + 1  # 1-indexed game counter for the running averages
        gs_copy = gs.clone()
        # Optionally shuffle which agent controls which player for this game.
        if shuffle_players:
            agents_copy = agents.copy()
            shuffle(agents_order)
            for i in agents_order:
                agents_copy[i] = agents[agents_order[i]]
        start = time.time()
        run_to_the_end(agents_copy, gs_copy, render=render)
        total_times += time.time() - start
        # Accumulate this game's scores, reordered according to agents_order.
        total_scores += gs_copy.get_scores()[agents_order]
        if writer:
            mean_scores = total_scores / game
            mean_time_per_game = total_times / game
            write_experiment_row(writer, game, mean_scores, mean_time_per_game)
    return total_scores, total_scores / games_count, total_times / games_count
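# A possible way to wire the runners above together. TicTacToeGameState is a
# hypothetical GameState implementation used purely for illustration; substitute
# any concrete game from the project.
if __name__ == "__main__":
    agents = [RandomAgent(), RandomAgent()]
    gs = TicTacToeGameState()  # hypothetical concrete GameState
    total, mean_scores, mean_time = run_for_n_games_and_return_stats(
        agents, gs, games_count=100, shuffle_players=True, show_progress=True)
    print(f"Mean scores over 100 games: {mean_scores}, mean time per game: {mean_time:.3f}s")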
def act(self, gs: GameState) -> int:
    available_actions = gs.get_available_actions(gs.get_active_player())
    if self.agents is None:
        # Default to random rollout policies for every player.
        self.agents = [RandomAgent()] * gs.player_count()
    accumulated_scores = np.zeros((len(available_actions),))
    # Evaluate each candidate action by playing full games after it.
    for i, a in enumerate(available_actions):
        gs_clone = gs.clone()
        gs_clone.step(gs.get_active_player(), a)
        if self.determinist_environment:
            max_scores = run_for_n_games_and_return_max(
                self.agents, gs_clone, self.epochs_per_action)
            accumulated_scores[i] = max_scores[gs.get_active_player()]
        else:
            (total_scores, _, _) = run_for_n_games_and_return_stats(
                self.agents, gs_clone, self.epochs_per_action)
            accumulated_scores[i] = total_scores[gs.get_active_player()]
    return available_actions[np.argmax(accumulated_scores)]
def act(self, gs: GameState) -> int:
    root_hash = gs.get_unique_id()
    memory = self.memory if self.keep_memory else dict()

    if root_hash not in memory:
        MOMCTSAgent.create_node_in_memory(
            memory,
            root_hash,
            gs.get_available_actions(gs.get_active_player()),
            gs.get_active_player(),
        )

    for i in range(self.max_iteration):
        gs_copy = gs.clone()
        s = gs_copy.get_unique_id()
        history = []

        # SELECTION: follow UCB-1 while every edge of the node has been tried.
        while not gs_copy.is_game_over() and all(
                (edge["n"] > 0 for edge in memory[s])):
            chosen_edge = max(
                ((edge, MOMCTSAgent.ucb_1(edge)) for edge in memory[s]),
                key=lambda kv: kv[1],
            )[0]
            history.append((s, chosen_edge))
            gs_copy.step(gs_copy.get_active_player(), chosen_edge["a"])
            s = gs_copy.get_unique_id()
            if s not in memory:
                MOMCTSAgent.create_node_in_memory(
                    memory,
                    s,
                    gs_copy.get_available_actions(gs_copy.get_active_player()),
                    gs_copy.get_active_player(),
                )

        # EXPANSION: try one of the untried edges.
        if not gs_copy.is_game_over():
            chosen_edge = choice(
                list(filter(lambda e: e["n"] == 0, (edge for edge in memory[s]))))
            history.append((s, chosen_edge))
            gs_copy.step(gs_copy.get_active_player(), chosen_edge["a"])
            s = gs_copy.get_unique_id()
            if s not in memory:
                MOMCTSAgent.create_node_in_memory(
                    memory,
                    s,
                    gs_copy.get_available_actions(gs_copy.get_active_player()),
                    gs_copy.get_active_player(),
                )

        # SIMULATION: random rollout until the game ends.
        while not gs_copy.is_game_over():
            gs_copy.step(
                gs_copy.get_active_player(),
                choice(gs_copy.get_available_actions(gs_copy.get_active_player())),
            )

        scores = gs_copy.get_scores()

        # BACKPROPAGATION of the final scores along the visited edges.
        for (s, edge) in history:
            edge["n"] += 1
            edge["r"] += scores[edge["p"]]
            for neighbour_edge in memory[s]:
                neighbour_edge["np"] += 1

    # Play the most visited action from the root.
    return max((edge for edge in memory[root_hash]), key=lambda e: e["n"])["a"]
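# The MCTS loop above stores one dictionary per edge with fields "a" (action),
# "p" (player to move), "n" (visit count), "r" (cumulated return) and "np"
# (parent visit count). A minimal sketch of the two helpers it calls, assuming
# the standard UCB-1 formula with exploration constant sqrt(2); the project's
# actual implementation and constant may differ.
import math

class MOMCTSHelpersSketch:
    @staticmethod
    def create_node_in_memory(memory, node_hash, available_actions, player):
        # One edge per available action, with all counters starting at zero.
        memory[node_hash] = [
            {"a": a, "p": player, "n": 0, "r": 0.0, "np": 0}
            for a in available_actions
        ]

    @staticmethod
    def ucb_1(edge, c: float = math.sqrt(2.0)):
        # Mean return plus an exploration bonus that grows with parent visits.
        return edge["r"] / edge["n"] + c * math.sqrt(math.log(edge["np"]) / edge["n"])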
def act(self, gs: GameState) -> int:
    root_hash = gs.get_unique_id()
    memory = self.memory if self.keep_memory else dict()

    if root_hash not in memory:
        q_values = self.brain.predict(gs.get_vectorized_state())
        HalfAlphaZeroAgent.create_node_in_memory(
            memory,
            root_hash,
            gs.get_available_actions(gs.get_active_player()),
            gs.get_active_player(),
            q_values,
        )

    for i in range(self.max_iteration):
        gs_copy = gs.clone()
        s = gs_copy.get_unique_id()
        history = []

        # SELECTION: follow UCB-1 while every edge of the node has been tried.
        while not gs_copy.is_game_over() and all(
                (edge["n"] > 0 for edge in memory[s])):
            chosen_edge = max(
                ((edge, HalfAlphaZeroAgent.ucb_1(edge)) for edge in memory[s]),
                key=lambda kv: kv[1],
            )[0]
            history.append((s, chosen_edge))
            gs_copy.step(gs_copy.get_active_player(), chosen_edge["a"])
            s = gs_copy.get_unique_id()
            if s not in memory:
                q_values = self.brain.predict(gs_copy.get_vectorized_state())
                HalfAlphaZeroAgent.create_node_in_memory(
                    memory,
                    s,
                    gs_copy.get_available_actions(gs_copy.get_active_player()),
                    gs_copy.get_active_player(),
                    q_values,
                )

        # EXPANSION: try one of the untried edges.
        if not gs_copy.is_game_over():
            chosen_edge = choice(
                list(filter(lambda e: e["n"] == 0, (edge for edge in memory[s]))))
            history.append((s, chosen_edge))
            gs_copy.step(gs_copy.get_active_player(), chosen_edge["a"])
            s = gs_copy.get_unique_id()
            if s not in memory:
                q_values = self.brain.predict(gs_copy.get_vectorized_state())
                HalfAlphaZeroAgent.create_node_in_memory(
                    memory,
                    s,
                    gs_copy.get_available_actions(gs_copy.get_active_player()),
                    gs_copy.get_active_player(),
                    q_values,
                )

        # No simulation phase: the network's value estimate replaces the rollout.
        scores = np.zeros(gs_copy.player_count())
        scores_set = np.zeros(gs_copy.player_count())

        # BACKPROPAGATION of the estimated values along the visited edges.
        for (s, edge) in history:
            if scores_set[edge["p"]] == 0:
                scores_set[edge["p"]] = 1.0
                scores[edge["p"]] = edge["q"]
            edge["n"] += 1
            edge["r"] += scores[edge["p"]]
            for neighbour_edge in memory[s]:
                neighbour_edge["np"] += 1

    chosen_action = max((edge for edge in memory[root_hash]),
                        key=lambda e: e["n"])["a"]

    # Store the transition for a later supervised update of the network.
    if len(self.states_buffer) > 0:
        self.rewards_buffer.append(self.intermediate_reward)
    self.states_buffer.append(gs.get_vectorized_state())
    self.actions_buffer.append(
        to_categorical(chosen_action, gs.get_action_space_size()))
    self.intermediate_reward = 0.0
    return chosen_action
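# The HalfAlphaZeroAgent variant above skips the rollout: each edge also carries
# a "q" field, the network's value estimate for its action, which the
# backpropagation step uses in place of a simulated score. A minimal sketch of
# the corresponding node creation, assuming q_values is indexable by action id;
# the function name and field layout are assumptions.
def create_node_in_memory_with_q(memory, node_hash, available_actions, player, q_values):
    memory[node_hash] = [
        {"a": a, "p": player, "n": 0, "r": 0.0, "np": 0, "q": float(q_values[a])}
        for a in available_actions
    ]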
def act(self, gs: GameState) -> int:
    # Once the apprentice has been trained enough, it plays instead of the expert.
    if self.apprentice_training_count > self.apprentice_training_before_takeover:
        available_actions = gs.get_available_actions(gs.get_active_player())
        predictions = self.brain.predict(np.array([gs.get_vectorized_state()]))[0]
        return available_actions[np.argmax(predictions[available_actions])]

    root_hash = gs.get_unique_id()
    memory = self.memory if self.keep_memory else dict()

    if root_hash not in memory:
        ExpertApprenticeAgent.create_node_in_memory(
            memory,
            root_hash,
            gs.get_available_actions(gs.get_active_player()),
            gs.get_active_player(),
        )

    for i in range(self.max_iteration):
        gs_copy = gs.clone()
        s = gs_copy.get_unique_id()
        history = []

        # SELECTION: follow UCB-1 while every edge of the node has been tried.
        while not gs_copy.is_game_over() and all(
                (edge["n"] > 0 for edge in memory[s])):
            chosen_edge = max(
                ((edge, ExpertApprenticeAgent.ucb_1(edge)) for edge in memory[s]),
                key=lambda kv: kv[1],
            )[0]
            history.append((s, chosen_edge))
            gs_copy.step(gs_copy.get_active_player(), chosen_edge["a"])
            s = gs_copy.get_unique_id()
            if s not in memory:
                ExpertApprenticeAgent.create_node_in_memory(
                    memory,
                    s,
                    gs_copy.get_available_actions(gs_copy.get_active_player()),
                    gs_copy.get_active_player(),
                )

        # EXPANSION: try one of the untried edges.
        if not gs_copy.is_game_over():
            chosen_edge = choice(
                list(filter(lambda e: e["n"] == 0, (edge for edge in memory[s]))))
            history.append((s, chosen_edge))
            gs_copy.step(gs_copy.get_active_player(), chosen_edge["a"])
            s = gs_copy.get_unique_id()
            if s not in memory:
                ExpertApprenticeAgent.create_node_in_memory(
                    memory,
                    s,
                    gs_copy.get_available_actions(gs_copy.get_active_player()),
                    gs_copy.get_active_player(),
                )

        # SIMULATION: random rollout until the game ends.
        while not gs_copy.is_game_over():
            gs_copy.step(
                gs_copy.get_active_player(),
                choice(gs_copy.get_available_actions(gs_copy.get_active_player())),
            )

        scores = gs_copy.get_scores()

        # BACKPROPAGATION of the final scores along the visited edges.
        for (s, edge) in history:
            edge["n"] += 1
            edge["r"] += scores[edge["p"]]
            for neighbour_edge in memory[s]:
                neighbour_edge["np"] += 1

    # The expert's visit counts at the root become the apprentice's training target.
    target = np.zeros(gs.get_action_space_size())
    for edge in memory[root_hash]:
        target[edge["a"]] = edge["n"]
    target /= np.sum(target)
    self.states_buffer.append(gs.get_vectorized_state())
    self.actions_buffer.append(target)

    if len(self.states_buffer) > 200:
        self.apprentice_training_count += 1
        self.brain.fit(
            np.array(self.states_buffer), np.array(self.actions_buffer), verbose=0)
        self.states_buffer.clear()
        self.actions_buffer.clear()
        if self.apprentice_training_count > self.apprentice_training_before_takeover:
            print("Apprentice is playing next round")

    return max((edge for edge in memory[root_hash]), key=lambda e: e["n"])["a"]
def act(self, gs: GameState) -> int:
    available_actions = gs.get_available_actions(gs.get_active_player())
    return np.random.choice(available_actions)