def act(self, gs: GameState) -> int:
    gs_unique_id = gs.get_unique_id()
    available_actions = gs.get_available_actions(gs.get_active_player())
    state_vec = gs.get_vectorized_state()
    mask_vec = np.zeros((self.action_space_size,))
    mask_vec[available_actions] = 1.0

    v = self.critic.predict(state_vec)
    p = self.actor.predict(state_vec, mask_vec)

    indexes = np.arange(self.action_space_size)
    chosen_action = np.random.choice(indexes, p=p)

    # valid_actions_probability = p[available_actions]
    # valid_actions_probability_sum = np.sum(valid_actions_probability)
    # normalized_valid_action_probability = valid_actions_probability / valid_actions_probability_sum
    #
    # chosen_action = np.random.choice(available_actions, p=normalized_valid_action_probability)

    self.v.append(v)
    self.s.append(state_vec)
    self.m.append(mask_vec)
    self.a.append(to_categorical(chosen_action, self.action_space_size))
    if not self.is_last_episode_terminal:
        self.r.append(self.r_temp)
        self.r_temp = 0.0
    self.is_last_episode_terminal = False

    return chosen_action
def act(self, gs: GameState) -> int:
    available_actions = gs.get_available_actions(gs.get_active_player())
    state_vec = gs.get_vectorized_state()
    predicted_Q_values = self.Q_action.predict(state_vec)

    if np.random.random() <= self.epsilon:
        chosen_action = np.random.choice(available_actions)
    else:
        chosen_action = available_actions[int(
            np.argmax(predicted_Q_values[available_actions]))]

    if self.s is not None:
        target = self.r + self.gamma * predicted_Q_values[int(
            np.argmax(self.Q_evaluation.predict(self.s)))]
        self.Q_action.train(self.s, self.a, target)

    if self.s is not None:
        # soft (Polyak) update of the evaluation network towards the action network
        update_Q_evaluation = self.tau * np.array(
            self.Q_action.model.get_weights()) + (1 - self.tau) * np.array(
                self.Q_evaluation.model.get_weights())
        self.Q_evaluation.model.set_weights(update_Q_evaluation)

    self.s = state_vec
    self.a = to_categorical(chosen_action, self.action_space_size)
    self.r = 0.0
    self.count_state += 1
    return chosen_action
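# The Q_evaluation update in the method above is a Polyak ("soft") target-network
# update, theta_eval <- tau * theta_action + (1 - tau) * theta_eval. Below is a
# minimal, self-contained sketch of that update on plain numpy weight lists; the
# function name and its arguments are illustrative, not taken from the project.
import numpy as np

def soft_update_sketch(action_weights, evaluation_weights, tau=0.01):
    # blend each weight tensor of the action network into the evaluation network
    return [tau * w_action + (1.0 - tau) * w_eval
            for w_action, w_eval in zip(action_weights, evaluation_weights)]

if __name__ == "__main__":
    a = [np.ones((2, 2)), np.zeros(3)]
    e = [np.zeros((2, 2)), np.ones(3)]
    # 10% of the action weights blended into the evaluation weights
    print(soft_update_sketch(a, e, tau=0.1))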
def act(self, gs: GameState) -> int:
    # gs_unique_id = gs.get_unique_id()
    available_actions = gs.get_available_actions(gs.get_active_player())
    state_vec = gs.get_vectorized_state()
    predicted_Q_values = self.Q.predict(state_vec)

    if np.random.random() <= self.epsilon:
        chosen_action = np.random.choice(available_actions)
    else:
        chosen_action = available_actions[int(
            np.argmax(predicted_Q_values[available_actions]))]

    if self.s is not None:
        target = self.r + self.gamma * self.alternate_Q.predict(
            state_vec)[available_actions][np.argmax(
                self.Q.predict(state_vec)[available_actions])]
        # final_target = self.model.predict(state)
        # final_target[0][action] = target
        # self.model.fit(state, final_target, verbose=0)
        self.Q.train(self.s, self.a, target)

    self.s = state_vec
    self.a = to_categorical(chosen_action, self.action_space_size)
    self.r = 0.0
    return chosen_action
def act(self, gs: GameState) -> int:
    gs_unique_id = gs.get_unique_id()
    available_actions = gs.get_available_actions(gs.get_active_player())
    state_vec = gs.get_vectorized_state()
    predicted_Q_values = self.Q.predict(state_vec)

    if np.random.random() <= self.epsilon:
        chosen_action = np.random.choice(available_actions)
    else:
        chosen_action = available_actions[int(
            np.argmax(predicted_Q_values[available_actions]))]

    if self.s is not None:
        target = self.r + self.gamma * max(
            predicted_Q_values[available_actions])
        self.Q.train(self.s, self.a, target)
        self.experience.append(
            (self.s.copy(), self.a.copy(), self.r, state_vec.copy()))
        print("experience", len(self.experience))
        if len(self.experience) % 10 == 0:
            # replay pass: each stored transition is (s, a, r, s'),
            # trained against the target r + gamma * max_a' Q(s', a')
            for (old_s, old_a, old_r, next_s) in self.experience:
                target = old_r + self.gamma * np.max(self.Q.predict(next_s))
                self.Q.train(old_s, old_a, target)

    self.s = state_vec
    self.a = to_categorical(chosen_action, self.action_space_size)
    self.r = 0.0
    return chosen_action
def act(self, gs: GameState) -> int:
    gs_unique_id = gs.get_unique_id()
    available_actions = gs.get_available_actions(gs.get_active_player())

    if gs_unique_id not in self.Q:
        self.Q[gs_unique_id] = dict()
        for a in available_actions:
            self.Q[gs_unique_id][a] = (np.random.random() * 2.0 - 1.0) / 10.0

    if np.random.random() <= self.epsilon:
        chosen_action = np.random.choice(available_actions)
    else:
        chosen_action = max(self.Q[gs_unique_id],
                            key=self.Q[gs_unique_id].get)

    if self.s is not None:
        self.Q[self.s][self.a] += \
            self.alpha * (self.r +
                          self.gamma * max(self.Q[gs_unique_id].values()) -
                          self.Q[self.s][self.a])

    self.s = gs_unique_id
    self.a = chosen_action
    self.r = 0.0
    return self.a
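# The update in the method above is the standard tabular Q-learning rule:
#     Q(s, a) <- Q(s, a) + alpha * (r + gamma * max_a' Q(s', a') - Q(s, a))
# As a quick worked example: with alpha = 0.1, gamma = 0.9, Q(s, a) = 0.2,
# r = 1.0 and max_a' Q(s', a') = 0.5, the new value is
# 0.2 + 0.1 * (1.0 + 0.45 - 0.2) = 0.325.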
def act(self, gs: GameState) -> int:
    available_actions = gs.get_available_actions(gs.get_active_player())
    state_vec = gs.get_vectorized_state()
    predicted_Q_values = self.Q_action.predict(state_vec)

    if np.random.random() <= self.epsilon:
        chosen_action = np.random.choice(available_actions)
    else:
        chosen_action = available_actions[int(
            np.argmax(predicted_Q_values[available_actions]))]

    if self.s is not None:
        target = self.r + self.gamma * predicted_Q_values[int(
            np.argmax(self.Q_evaluation.predict(self.s)))]
        self.Q_action.train(self.s, self.a, target)
        self.experience.append(
            (self.s.copy(), self.a.copy(), self.r, state_vec.copy()))

        if len(self.experience) % 10 == 0 and len(
                self.experience) > 0 and self.epsilon > 0:
            # sample a mini-batch of at most 30 transitions
            # (requires `from random import sample`)
            batch = sample(
                self.experience,
                len(self.experience) if len(self.experience) < 30 else 30)
            el_state = [x[0] for x in batch]
            el_a = [x[1] for x in batch]
            el_r = [x[2] for x in batch]
            el_state_plus_1 = [x[3] for x in batch]
            predicted_Q_values_list = self.Q_action.model.predict(
                np.array(el_state_plus_1))
            Q_star = [
                x[int(np.argmax(self.Q_evaluation.predict(el_state[i])))]
                for i, x in enumerate(predicted_Q_values_list)
            ]
            target = np.array(el_r) + self.gamma * np.array(Q_star)
            self.Q_action.retrain(np.array(el_state), np.array(el_a), target)

    if self.s is not None:
        # soft (Polyak) update of the evaluation network towards the action network
        update_Q_evaluation = self.tau * np.array(
            self.Q_action.model.get_weights()) + (1 - self.tau) * np.array(
                self.Q_evaluation.model.get_weights())
        self.Q_evaluation.model.set_weights(update_Q_evaluation)

    self.s = state_vec
    self.a = to_categorical(chosen_action, self.action_space_size)
    self.r = 0.0
    self.count_state += 1
    return chosen_action
def act(self, gs: GameState) -> int:
    root_hash = gs.get_unique_id()
    memory = self.memory if self.keep_memory else dict()

    if root_hash not in memory:
        MOMCTSAgent.create_node_in_memory(
            memory, root_hash,
            gs.get_available_actions(gs.get_active_player()),
            gs.get_active_player())

    for i in range(self.max_iteration):
        gs_copy = gs.clone()
        s = gs_copy.get_unique_id()
        history = []

        # SELECTION
        while not gs_copy.is_game_over() and all(
                (edge['n'] > 0 for edge in memory[s])):
            chosen_edge = max(((edge, MOMCTSAgent.ucb_1(edge))
                               for edge in memory[s]),
                              key=lambda kv: kv[1])[0]
            history.append((s, chosen_edge))
            gs_copy.step(gs_copy.get_active_player(), chosen_edge['a'])
            s = gs_copy.get_unique_id()
            if s not in memory:
                MOMCTSAgent.create_node_in_memory(
                    memory, s,
                    gs_copy.get_available_actions(gs_copy.get_active_player()),
                    gs_copy.get_active_player())

        # EXPANSION
        if not gs_copy.is_game_over():
            chosen_edge = choice(
                list(filter(lambda e: e['n'] == 0,
                            (edge for edge in memory[s]))))
            history.append((s, chosen_edge))
            gs_copy.step(gs_copy.get_active_player(), chosen_edge['a'])
            s = gs_copy.get_unique_id()
            if s not in memory:
                MOMCTSAgent.create_node_in_memory(
                    memory, s,
                    gs_copy.get_available_actions(gs_copy.get_active_player()),
                    gs_copy.get_active_player())

        # SIMULATION
        while not gs_copy.is_game_over():
            gs_copy.step(
                gs_copy.get_active_player(),
                choice(gs_copy.get_available_actions(
                    gs_copy.get_active_player())))

        scores = gs_copy.get_scores()

        # BACKPROPAGATION OF THE SCORES
        for (s, edge) in history:
            edge['n'] += 1
            edge['r'] += scores[edge['p']]
            for neighbour_edge in memory[s]:
                neighbour_edge['np'] += 1

    return max((edge for edge in memory[root_hash]),
               key=lambda e: e['n'])['a']
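# MOMCTSAgent.ucb_1 is referenced above but not shown in this listing. Below is a
# minimal sketch of a UCB-1 edge score consistent with the fields used by the
# memory ('r' = accumulated reward, 'n' = edge visit count, 'np' = parent visit
# count); the function name and the exploration constant c are assumptions.
import math

def ucb_1_sketch(edge, c=math.sqrt(2.0)):
    # average reward of the edge plus an exploration bonus that decreases
    # as the edge is visited more often relative to its parent node
    return edge['r'] / edge['n'] + c * math.sqrt(math.log(edge['np']) / edge['n'])

if __name__ == "__main__":
    print(ucb_1_sketch({'r': 3.0, 'n': 5, 'np': 20}))  # ~1.69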
def run_for_n_games_and_return_stats(
        agents: List[Agent], gs: GameState,
        games_count: int) -> (np.ndarray, np.ndarray):
    total_scores = np.zeros_like(gs.get_scores())
    for _ in range(games_count):
        gs_copy = gs.clone()
        run_to_the_end(agents, gs_copy)
        total_scores += gs_copy.get_scores()
    return total_scores, total_scores / games_count
def act(self, gs: GameState) -> int:
    print(gs)
    available_actions = gs.get_available_actions(gs.get_active_player())
    if self.action in available_actions:
        self.message = "Your turn"
        return self.action
    else:
        self.message = "Action not valid, please try again with: " + str(
            available_actions)
        return -1
def run_for_n_games_and_return_max(agents: List[Agent], gs: GameState,
                                   games_count: int) -> np.ndarray:
    old_and_new_scores = np.ones((2, len(gs.get_scores()))) * -9999.9
    for _ in range(games_count):
        gs_copy = gs.clone()
        run_to_the_end(agents, gs_copy)
        new_scores = gs_copy.get_scores()
        old_and_new_scores[1, :] = new_scores
        old_and_new_scores[0, :] = np.max(old_and_new_scores, axis=0)
    return old_and_new_scores[0, :]
def act(self, gs: GameState) -> int:
    self.priority.append(0.001)
    self.memory.append((self.s, self.s, self.a, self.r, True))

    available_actions = gs.get_available_actions(gs.get_active_player())
    state_vec = gs.get_vectorized_state()
    predicted_Q_values = self.Q.predict(state_vec)

    if np.random.random() <= self.epsilon:
        chosen_action = np.random.choice(available_actions)
    else:
        chosen_action = available_actions[int(
            np.argmax(predicted_Q_values[available_actions]))]

    batch, importance = self.get_priority_experience_batch()
    for b, i in zip(batch, importance):
        state, next_state, action, reward, done = b
        target = reward
        if not done:
            if self.s is not None:
                # target = target + self.gamma * self.alternate_Q.predict(state_vec)[available_actions][
                #     np.argmax(self.Q.predict(state_vec)[available_actions])]
                # self.Q.train(self.s, self.a, target)
                q_next = reward + self.gamma * self.alternate_Q.predict(
                    next_state)[available_actions][np.argmax(
                        self.Q.predict(next_state)[available_actions])]
                target = q_next
                q = self.alternate_Q.predict(
                    next_state)[available_actions][np.argmax(
                        self.Q.predict(next_state)[available_actions])]
                p = (np.abs(q_next - q) + (np.e ** -10)) ** self.alpha
                self.priority.append(p)
                self.memory.append(
                    (state, next_state, action, reward, done))
        self.Q.train(self.s, self.a, target)
        imp = i ** (1 - self.epsilon)
        imp = np.reshape(imp, 1)

    # self.remember(self.s, state_vec, self.a, self.r, True)
    # batch = random.choices(self.memory, k=self.batch_size)
    # for state, next_state, action, reward, done in batch:
    #     target = reward
    #     if not done:
    #         if self.s is not None:
    #             target = target + self.gamma * self.alternate_Q.predict(state_vec)[available_actions][
    #                 np.argmax(self.Q.predict(state_vec)[available_actions])]
    #     self.Q.train(self.s, self.a, target)
    #     self.remember(self.s, state_vec, self.a, self.r, True)

    self.s = state_vec
    self.a = to_categorical(chosen_action, self.action_space_size)
    self.r = 0.0
    return chosen_action
def act(self, gs: GameState) -> int:
    print(gs)
    available_actions = gs.get_available_actions(gs.get_active_player())
    print(f"Choose action index from: {available_actions}")
    while True:
        try:
            action_candidate = int(input())
            if action_candidate in available_actions:
                break
        except Exception as _:
            pass
        print("Action not valid, please try again!")
    return action_candidate
def act(self, gs: GameState) -> int:
    available_actions = gs.get_available_actions(gs.get_active_player())
    state_vec = gs.get_vectorized_state()
    predicted_Q_values = self.Q.predict(state_vec)

    if np.random.random() <= self.epsilon:
        chosen_action = np.random.choice(available_actions)
    else:
        chosen_action = available_actions[int(
            np.argmax(predicted_Q_values[available_actions]))]

    if self.s is not None:
        target = self.r + self.gamma * max(
            predicted_Q_values[available_actions])
        self.Q.train(self.s, self.a, target)

    self.s = state_vec
    self.a = to_categorical(chosen_action, self.action_space_size)
    self.r = 0.0
    return chosen_action
def act(self, gs: GameState) -> int:
    available_actions = gs.get_available_actions(gs.get_active_player())
    state_vec = gs.get_vectorized_state()
    action_probs = self.Q_policy_function.predict(state_vec)

    chosen_action = np.random.choice(available_actions,
                                     p=action_probs,
                                     replace=True)

    self.state.append(state_vec)
    self.rewards.append(self.r)
    self.log_probs.append(np.log(action_probs))
    self.probs.append(action_probs)
    self.action.append(chosen_action)
    self.a.append(to_categorical(chosen_action, self.action_space_size))
    self.r = 0.0
    return chosen_action
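# The buffers filled above (states, rewards, log-probabilities, one-hot actions)
# are what a REINFORCE-style policy-gradient update needs; the gradient estimate is
#     grad J(theta) ~ sum_t grad log pi_theta(a_t | s_t) * G_t,
# where G_t is the discounted return from step t. The actual fit of
# Q_policy_function is presumably performed elsewhere, e.g. when the episode ends.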
def run_step(agents: List[Agent], gs: GameState):
    assert (not gs.is_game_over())
    active_player_index = gs.get_active_player()
    old_scores = gs.get_scores().copy()
    action = agents[active_player_index].act(gs)
    gs.step(active_player_index, action)
    new_scores = gs.get_scores()
    rewards = new_scores - old_scores
    for i, agent in enumerate(agents):
        agent.observe(rewards[i], gs.is_game_over(), i)
def run_for_n_games_and_return_stats(
        agents: List[Agent], gs: GameState, games_count: int,
        shuffle_players: bool = False) -> (np.ndarray, np.ndarray):
    total_scores = np.zeros_like(gs.get_scores())
    agents_order = np.arange(len(agents))
    agents_copy = agents
    if shuffle_players:
        agents_copy = agents.copy()
    for _ in range(games_count):
        gs_copy = gs.clone()
        if shuffle_players:
            agents_copy = agents.copy()
            shuffle(agents_order)
            for i in agents_order:
                agents_copy[i] = agents[agents_order[i]]
        run_to_the_end(agents_copy, gs_copy)
        total_scores += gs_copy.get_scores()[agents_order]
    return total_scores, total_scores / games_count
def act(self, gs: GameState) -> int:
    available_actions = gs.get_available_actions(gs.get_active_player())
    state_vec = gs.get_vectorized_state()
    predicted_Q_values = self.Q.predict(state_vec)

    if np.random.random() <= self.epsilon:
        chosen_action = np.random.choice(available_actions)
    else:
        chosen_action = available_actions[int(
            np.argmax(predicted_Q_values[available_actions]))]

    batch = random.choices(self.memory, k=self.batch_size)
    for state, next_state, action, reward, done in batch:
        target = reward
        if not done:
            if self.s is not None:
                target = target + self.gamma * self.alternate_Q.predict(
                    state_vec)[available_actions][np.argmax(
                        self.Q.predict(state_vec)[available_actions])]
        self.Q.train(self.s, self.a, target)
    self.remember(self.s, state_vec, self.a, self.r, True)

    self.s = state_vec
    self.a = to_categorical(chosen_action, self.action_space_size)
    self.r = 0.0
    return chosen_action
def act(self, gs: GameState) -> int:
    available_actions = gs.get_available_actions(gs.get_active_player())
    if self.agents is None:
        self.agents = [RandomAgent()] * gs.player_count()

    accumulated_scores = np.zeros((len(available_actions),))
    for i, a in enumerate(available_actions):
        gs_clone = gs.clone()
        gs_clone.step(gs.get_active_player(), a)
        if self.determinist_environment:
            max_scores = run_for_n_games_and_return_max(
                self.agents, gs_clone, self.epochs_per_action)
            accumulated_scores[i] = max_scores[gs.get_active_player()]
        else:
            (total_scores, _) = run_for_n_games_and_return_stats(
                self.agents, gs_clone, self.epochs_per_action)
            accumulated_scores[i] = total_scores[gs.get_active_player()]

    # print((accumulated_scores, available_actions[np.argmax(accumulated_scores)]))
    return available_actions[np.argmax(accumulated_scores)]
def act(self, gs: GameState) -> int:
    """
    Args:
        board [int]: the current state of the board
    Returns:
        int: the index of the move to be made
    """
    # TODO would it be better to only map to qubits that could be moves?
    # However this would mean lots of index/physical qubit number swaps
    board = [-1] * 9
    # print(board)
    # print(gs.get_available_actions(gs.get_active_player()))
    for x in gs.get_available_actions(gs.get_active_player()):
        # print(x)
        board[x] = None
    # print(board)

    num_qubits = 9
    use_ibm = False
    if self.i == 0 and use_ibm:
        IBMQ.save_account('mykey', overwrite=True)
        IBMQ.load_account()
        # provider = IBMQ.get_provider(hub='ibm-q')
        # backend = least_busy(provider.backends(filters=lambda x:
        #                      x.configuration().n_qubits >= 9 and
        #                      not x.configuration().simulator and
        #                      x.status().operational == True))
        # print("least busy backend: ", backend)
    self.i += 1

    # reset the T gate counter
    self.num_t_gates = [0] * num_qubits

    # quantum register holding the 9 qubits (one per board square)
    q = QuantumRegister(num_qubits)
    # classical register storing the measurement results, in this case 9 bits
    # corresponding to all the squares of the board
    c = ClassicalRegister(num_qubits)
    # build the circuit
    qc = QuantumCircuit(q, c)

    # make the option definitely a 1 if there is a move in the space already
    for index, move in enumerate(board):
        if move:
            # delete action from possibilities
            qc.x(q[index])
        else:
            # this space is a potential move, so put it into a superposition
            qc.h(q[index])
            t_count = 0

            # so it is the start of a row
            if index % 3 == 0:
                # two pieces in a row - need to block/win
                if board[index + 1] and board[index + 2] and board[
                        index + 1] == board[index + 2]:
                    qc.t(q[index])
                    qc.t(q[index])
                    t_count += 2
                # only one of the spaces is occupied
                # (^ is xor, but they both have to explicitly be bools)
                elif bool(board[index + 1]) ^ bool(board[index + 2]):
                    qc.t(q[index])
                    t_count += 1

            # so it is the middle of a row
            if index % 3 == 1:
                # two pieces in a row - need to block/win
                if board[index + 1] and board[index - 1] and board[
                        index + 1] == board[index - 1]:
                    qc.t(q[index])
                    qc.t(q[index])
                    t_count += 2
                elif bool(board[index + 1]) ^ bool(board[index - 1]):
                    qc.t(q[index])
                    t_count += 1

            # so it is the end of a row
            if index % 3 == 2:
                # two pieces in a row - need to block/win
                if board[index - 1] and board[index - 2] and board[
                        index - 1] == board[index - 2]:
                    qc.t(q[index])
                    qc.t(q[index])
                    t_count += 2
                elif bool(board[index - 1]) ^ bool(board[index - 2]):
                    qc.t(q[index])
                    t_count += 1

            # so it is the top row
            if index / 3 < 1:
                if board[index + 3] and board[index + 6] and board[
                        index + 3] == board[index + 6]:
                    qc.t(q[index])
                    qc.t(q[index])
                    t_count += 2
                elif bool(board[index + 3]) ^ bool(board[index + 6]):
                    qc.t(q[index])
                    t_count += 1

            # so it is the middle row
            if 2 > index / 3 >= 1:
                if board[index - 3] and board[index + 3] and board[
                        index - 3] == board[index + 3]:
                    qc.t(q[index])
                    qc.t(q[index])
                    t_count += 2
                elif bool(board[index - 3]) ^ bool(board[index + 3]):
                    qc.t(q[index])
                    t_count += 1

            # so it is the bottom row
            if index / 3 >= 2:
                if board[index - 3] and board[index - 6] and board[
                        index - 3] == board[index - 6]:
                    qc.t(q[index])
                    qc.t(q[index])
                    t_count += 2
                elif bool(board[index - 3]) ^ bool(board[index - 6]):
                    qc.t(q[index])
                    t_count += 1

            self.num_t_gates[index] = t_count

    # hard code in the diagonals
    if board[0] and board[0] == board[4]:
        qc.t(q[8])
        qc.t(q[8])
        qc.t(q[8])
        self.num_t_gates[8] += 3
    if board[0] and board[0] == board[8]:
        qc.t(q[4])
        qc.t(q[4])
        qc.t(q[4])
        self.num_t_gates[4] += 3
    if board[4] and board[4] == board[8]:
        qc.t(q[0])
        qc.t(q[0])
        qc.t(q[0])
        self.num_t_gates[0] += 3
    if board[2] and board[2] == board[4]:
        qc.t(q[6])
        qc.t(q[6])
        qc.t(q[6])
        self.num_t_gates[6] += 3
    if board[2] and board[2] == board[6]:
        qc.t(q[4])
        qc.t(q[4])
        qc.t(q[4])
        self.num_t_gates[4] += 3
    if board[4] and board[4] == board[6]:
        qc.t(q[2])
        qc.t(q[2])
        qc.t(q[2])
        self.num_t_gates[2] += 3

    for index, move in enumerate(board):
        if not move:
            qc.h(q[index])
        else:
            # if there is already a move there,
            # don't show that any t gates were applied
            self.num_t_gates[index] = -1

    qc.measure(q, c)

    backend = Aer.get_backend('qasm_simulator')
    logger.info(
        "Made the circuit, running it on the backend: {}".format(backend))
    shots = 100
    # job_sim = execute(qc, backend, shots=shots)
    job_sim = execute(qc, backend=backend, shots=shots)
    sim_result = job_sim.result().get_counts(qc)
    job_monitor(job_sim, interval=2)

    counts = [0] * num_qubits
    for key, count in sim_result.items():
        # need to iterate over the results and see which value was chosen most;
        # keys are the opposite way round to expected
        key = key[::-1]
        for index, val in enumerate(key):
            if val == '1':
                counts[index] += count

    max_count = 0
    max_index = 0
    for index, count in enumerate(counts):
        if not board[index] and count > max_count:
            max_index = index
            max_count = count
    self.move = max_index

    results = job_sim.result()
    answer = results.get_counts(qc)
    '''
    plot_circuit = qc.draw(output="mpl")
    plot_hist = plot_histogram(answer)
    plot_circuit = qc.draw(output="mpl")
    plot_hist.show()
    plot_circuit.show()
    plot_hist.savefig('quantum_grover_agent_histogram.png')
    '''
    # logger.info("Quantum choice = {}".format(self.move))
    return self.move
def act(self, gs: GameState) -> int:
    """
    Args:
        board [int]: the current state of the board
    Returns:
        int: the index of the move to be made
    """
    board = [-1] * 9
    for x in gs.get_available_actions(gs.get_active_player()):
        print(x)
        board[x] = None

    # superposition of potential moves
    groverCircuit, registers = self._board_to_superposition(board)
    # this means the only available space is the last one
    if isinstance(groverCircuit, int):
        self.move = groverCircuit
        return self.move

    QuantumGroverAgent._oracle(groverCircuit, registers[0])
    QuantumGroverAgent._inversion_about_average(groverCircuit, registers[0], 3)
    groverCircuit.measure(registers[0], registers[1])

    # run circuit
    backend = Aer.get_backend('qasm_simulator')
    shots = 1024
    results = execute(groverCircuit, backend=backend, shots=shots).result()
    answer = results.get_counts(groverCircuit)
    '''
    plot_hist = plot_histogram(answer)
    plot_circuit = groverCircuit.draw(output="mpl")
    plot_hist.show()
    plot_circuit.show()
    plot_hist.savefig('quantum_grover_agent_histogram.png')
    '''
    answer = results.get_counts()
    reversed_answer = {}
    # reverse all the keys
    for state, count in answer.items():
        reversed_answer[state[::-1]] = count
    print(reversed_answer)

    # get the highest move
    max_count = 0
    winning_state = ''
    for state, count in reversed_answer.items():
        if count > max_count:
            max_count = count
            winning_state = state

    print('The move is ', winning_state, ' which is the same as ',
          str(int(winning_state, 2)))
    self.move = int(winning_state, 2)
    self.counts = reversed_answer

    # shouldn't happen? but just in case
    if board[int(winning_state, 2)]:
        spaces = [i for i, mv in enumerate(board) if mv is None]
        self.move = random.choice(spaces)

    # logger.info("Quantum choice = {}".format(self.move))
    return self.move
def act(self, gs: GameState) -> int:
    available_actions = gs.get_available_actions(gs.get_active_player())
    return np.random.choice(available_actions)
def run_to_the_end(agents: List[Agent], gs: GameState):
    while not gs.is_game_over():
        run_step(agents, gs)
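# A minimal usage sketch of the game-loop helpers above. TicTacToeGameState and
# the import paths are assumptions about the surrounding project; any GameState
# implementation and any pair of agents would be driven the same way, which is
# why the sketch is left commented out.
# from environments import TicTacToeGameState   # hypothetical import
# from agents import RandomAgent                # hypothetical import
#
# gs = TicTacToeGameState()
# agents = [RandomAgent(), RandomAgent()]
# total_scores, mean_scores = run_for_n_games_and_return_stats(agents, gs, 1000)
# print("totals:", total_scores, "means:", mean_scores)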
def act(self, gs: GameState) -> int:
    if self.apprentice_training_count > self.apprentice_training_before_takeover:
        return gs.get_available_actions(gs.get_active_player())[np.argmax(
            self.brain.predict(np.array([gs.get_vectorized_state()]))[0][
                gs.get_available_actions(gs.get_active_player())])]

    root_hash = gs.get_unique_id()
    memory = self.memory if self.keep_memory else dict()

    if root_hash not in memory:
        ExpertApprenticeAgent.create_node_in_memory(
            memory, root_hash,
            gs.get_available_actions(gs.get_active_player()),
            gs.get_active_player())

    for i in range(self.max_iteration):
        gs_copy = gs.clone()
        s = gs_copy.get_unique_id()
        history = []

        # SELECTION
        while not gs_copy.is_game_over() and all(
                (edge['n'] > 0 for edge in memory[s])):
            chosen_edge = max(((edge, ExpertApprenticeAgent.ucb_1(edge))
                               for edge in memory[s]),
                              key=lambda kv: kv[1])[0]
            history.append((s, chosen_edge))
            gs_copy.step(gs_copy.get_active_player(), chosen_edge['a'])
            s = gs_copy.get_unique_id()
            if s not in memory:
                ExpertApprenticeAgent.create_node_in_memory(
                    memory, s,
                    gs_copy.get_available_actions(gs_copy.get_active_player()),
                    gs_copy.get_active_player())

        # EXPANSION
        if not gs_copy.is_game_over():
            chosen_edge = choice(
                list(filter(lambda e: e['n'] == 0,
                            (edge for edge in memory[s]))))
            history.append((s, chosen_edge))
            gs_copy.step(gs_copy.get_active_player(), chosen_edge['a'])
            s = gs_copy.get_unique_id()
            if s not in memory:
                ExpertApprenticeAgent.create_node_in_memory(
                    memory, s,
                    gs_copy.get_available_actions(gs_copy.get_active_player()),
                    gs_copy.get_active_player())

        # SIMULATION
        while not gs_copy.is_game_over():
            gs_copy.step(
                gs_copy.get_active_player(),
                choice(gs_copy.get_available_actions(
                    gs_copy.get_active_player())))

        scores = gs_copy.get_scores()

        # BACKPROPAGATION OF THE SCORES
        for (s, edge) in history:
            edge['n'] += 1
            edge['r'] += scores[edge['p']]
            for neighbour_edge in memory[s]:
                neighbour_edge['np'] += 1

    target = np.zeros(gs.get_action_space_size())
    for edge in memory[root_hash]:
        target[edge['a']] = edge['n']
    target /= np.sum(target)

    self.states_buffer.append(gs.get_vectorized_state())
    self.actions_buffer.append(target)

    if len(self.states_buffer) > 200:
        self.apprentice_training_count += 1
        self.brain.fit(np.array(self.states_buffer),
                       np.array(self.actions_buffer))
        self.states_buffer.clear()
        self.actions_buffer.clear()
        if self.apprentice_training_count > self.apprentice_training_before_takeover:
            print('Apprentice is playing next round')

    return max((edge for edge in memory[root_hash]),
               key=lambda e: e['n'])['a']
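# The apprentice's training target built above is the normalised visit-count
# distribution of the MCTS root, i.e. pi(a | s) = N(s, a) / sum_b N(s, b), so the
# brain is trained to imitate the search policy (an Expert Iteration-style setup).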
def act(self, gs: GameState) -> int:
    root_hash = gs.get_unique_id()
    memory = self.memory if self.keep_memory else dict()

    if root_hash not in memory:
        q_values = self.brain.predict(gs.get_vectorized_state())
        HalfAlphaZeroAgent.create_node_in_memory(
            memory, root_hash,
            gs.get_available_actions(gs.get_active_player()),
            gs.get_active_player(), q_values)

    for i in range(self.max_iteration):
        gs_copy = gs.clone()
        s = gs_copy.get_unique_id()
        history = []

        # SELECTION
        while not gs_copy.is_game_over() and all(
                (edge['n'] > 0 for edge in memory[s])):
            chosen_edge = max(((edge, HalfAlphaZeroAgent.ucb_1(edge))
                               for edge in memory[s]),
                              key=lambda kv: kv[1])[0]
            history.append((s, chosen_edge))
            gs_copy.step(gs_copy.get_active_player(), chosen_edge['a'])
            s = gs_copy.get_unique_id()
            if s not in memory:
                q_values = self.brain.predict(gs_copy.get_vectorized_state())
                HalfAlphaZeroAgent.create_node_in_memory(
                    memory, s,
                    gs_copy.get_available_actions(gs_copy.get_active_player()),
                    gs_copy.get_active_player(), q_values)

        # EXPANSION
        if not gs_copy.is_game_over():
            chosen_edge = choice(
                list(filter(lambda e: e['n'] == 0,
                            (edge for edge in memory[s]))))
            history.append((s, chosen_edge))
            gs_copy.step(gs_copy.get_active_player(), chosen_edge['a'])
            s = gs_copy.get_unique_id()
            if s not in memory:
                q_values = self.brain.predict(gs_copy.get_vectorized_state())
                HalfAlphaZeroAgent.create_node_in_memory(
                    memory, s,
                    gs_copy.get_available_actions(gs_copy.get_active_player()),
                    gs_copy.get_active_player(), q_values)

        # no rollout: the stored Q-values take the place of the simulation result
        scores = np.zeros(gs_copy.player_count())
        scores_set = np.zeros(gs_copy.player_count())

        # BACKPROPAGATION OF THE SCORES
        for (s, edge) in history:
            if scores_set[edge['p']] == 0:
                scores_set[edge['p']] = 1.0
                scores[edge['p']] = edge['q']
            edge['n'] += 1
            edge['r'] += scores[edge['p']]
            for neighbour_edge in memory[s]:
                neighbour_edge['np'] += 1

    chosen_action = max((edge for edge in memory[root_hash]),
                        key=lambda e: e['n'])['a']

    if len(self.states_buffer) > 0:
        self.rewards_buffer.append(self.intermediate_reward)
    self.states_buffer.append(gs.get_vectorized_state())
    self.actions_buffer.append(
        to_categorical(chosen_action, gs.get_action_space_size()))
    self.intermediate_reward = 0.0

    return chosen_action