Example #1
    def act(self, gs: GameState) -> int:
        gs_unique_id = gs.get_unique_id()
        available_actions = gs.get_available_actions(gs.get_active_player())

        state_vec = gs.get_vectorized_state()

        mask_vec = np.zeros((self.action_space_size, ))
        mask_vec[available_actions] = 1.0

        v = self.critic.predict(state_vec)
        p = self.actor.predict(state_vec, mask_vec)

        indexes = np.arange(self.action_space_size)
        chosen_action = np.random.choice(indexes, p=p)

        # valid_actions_probability = p[available_actions]
        # valid_actions_probability_sum = np.sum(valid_actions_probability)
        # normalized_valid_action_probability = valid_actions_probability / valid_actions_probability_sum
        # #
        # chosen_action = np.random.choice(available_actions, p=normalized_valid_action_probability)

        self.v.append(v)

        self.s.append(state_vec)
        self.m.append(mask_vec)
        self.a.append(to_categorical(chosen_action, self.action_space_size))
        if not self.is_last_episode_terminal:
            self.r.append(self.r_temp)
        self.r_temp = 0.0
        self.is_last_episode_terminal = False

        return chosen_action
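
Here, self.actor.predict(state_vec, mask_vec) is assumed to return a distribution that already respects the action mask, which is why the commented-out renormalization above is redundant; a minimal sketch of such a masked softmax (a hypothetical helper, not the original network code):

import numpy as np

def masked_softmax(logits: np.ndarray, mask: np.ndarray) -> np.ndarray:
    # zero out unavailable actions, then renormalize to a valid distribution
    exp = np.exp(logits - np.max(logits)) * mask
    return exp / np.sum(exp)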
Example #2
    def act(self, gs: GameState) -> int:
        available_actions = gs.get_available_actions(gs.get_active_player())

        state_vec = gs.get_vectorized_state()
        predicted_Q_values = self.Q_action.predict(state_vec)

        if np.random.random() <= self.epsilon:
            chosen_action = np.random.choice(available_actions)
        else:
            chosen_action = available_actions[int(
                np.argmax(predicted_Q_values[available_actions]))]

        if self.s is not None:
            target = self.r + self.gamma * predicted_Q_values[int(
                np.argmax(self.Q_evaluation.predict(self.s)))]
            self.Q_action.train(self.s, self.a, target)

        if self.s is not None:
            update_Q_evaluation = self.tau * np.array(
                self.Q_action.model.get_weights()) + (1 - self.tau) * np.array(
                    self.Q_evaluation.model.get_weights())
            self.Q_evaluation.model.set_weights(update_Q_evaluation)

        self.s = state_vec
        self.a = to_categorical(chosen_action, self.action_space_size)
        self.r = 0.0
        self.count_state += 1

        return chosen_action
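
The soft target update above blends the two weight lists through NumPy object arrays, which works but is fragile; an equivalent, more explicit Polyak update, sketched under the assumption of Keras-style get_weights/set_weights:

def soft_update(online_model, target_model, tau: float) -> None:
    # Polyak averaging: target <- tau * online + (1 - tau) * target
    blended = [
        tau * w_online + (1.0 - tau) * w_target
        for w_online, w_target in zip(online_model.get_weights(),
                                      target_model.get_weights())
    ]
    target_model.set_weights(blended)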
Example #3
    def act(self, gs: GameState) -> int:
        #gs_unique_id = gs.get_unique_id()
        available_actions = gs.get_available_actions(gs.get_active_player())

        state_vec = gs.get_vectorized_state()
        predicted_Q_values = self.Q.predict(state_vec)

        if np.random.random() <= self.epsilon:
            chosen_action = np.random.choice(available_actions)
        else:
            chosen_action = available_actions[int(
                np.argmax(predicted_Q_values[available_actions]))]

        if self.s is not None:
            target = self.r + self.gamma * self.alternate_Q.predict(
                state_vec)[available_actions][np.argmax(
                    self.Q.predict(state_vec)[available_actions])]
            # final_target = self.model.predict(state)
            # final_target[0][action] = target
            # self.model.fit(state, final_target, verbose=0)
            self.Q.train(self.s, self.a, target)

        self.s = state_vec
        self.a = to_categorical(chosen_action, self.action_space_size)
        self.r = 0.0

        return chosen_action
Example #4
    def act(self, gs: GameState) -> int:
        gs_unique_id = gs.get_unique_id()
        available_actions = gs.get_available_actions(gs.get_active_player())

        state_vec = gs.get_vectorized_state()
        predicted_Q_values = self.Q.predict(state_vec)

        if np.random.random() <= self.epsilon:
            chosen_action = np.random.choice(available_actions)
        else:
            chosen_action = available_actions[int(
                np.argmax(predicted_Q_values[available_actions]))]

        if self.s is not None:
            target = self.r + self.gamma * max(
                predicted_Q_values[available_actions])
            self.Q.train(self.s, self.a, target)
            self.experience.append(
                (self.s.copy(), self.a.copy(), self.r, state_vec.copy()))
        print("experience", len(self.experience))

        if len(self.experience) % 10 == 0:
            for el in self.experience:
                # bootstrap from the stored next state; the buffer does not
                # keep that state's available actions, so take the max over
                # the full action space
                target = el[2] + self.gamma * np.max(self.Q.predict(el[3]))
                self.Q.train(el[0], el[1], target)
        self.s = state_vec
        self.a = to_categorical(chosen_action, self.action_space_size)
        self.r = 0.0

        return chosen_action
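
The self.experience list above grows without bound; a bounded buffer with the same tuple layout, sketched with collections.deque (the capacity value is an arbitrary assumption), would cap memory usage:

from collections import deque

def make_replay_buffer(capacity: int = 10_000) -> deque:
    # stores (state, action_one_hot, reward, next_state) tuples and silently
    # drops the oldest transition once the capacity is reached
    return deque(maxlen=capacity)

Since deque supports append and iteration like a list, assigning self.experience = make_replay_buffer() in the agent's constructor would be a drop-in replacement.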
Example #5
    def act(self, gs: GameState) -> int:
        gs_unique_id = gs.get_unique_id()
        available_actions = gs.get_available_actions(gs.get_active_player())
        if gs_unique_id not in self.Q:
            self.Q[gs_unique_id] = dict()
            for a in available_actions:
                self.Q[gs_unique_id][a] = (np.random.random() * 2.0 -
                                           1.0) / 10.0

        if np.random.random() <= self.epsilon:
            chosen_action = np.random.choice(available_actions)
        else:
            chosen_action = max(self.Q[gs_unique_id],
                                key=self.Q[gs_unique_id].get)

        if self.s is not None:
            self.Q[self.s][self.a] += \
                self.alpha * (self.r +
                              self.gamma * max(self.Q[gs_unique_id].values()) -
                              self.Q[self.s][self.a])

        self.s = gs_unique_id
        self.a = chosen_action
        self.r = 0.0

        return self.a
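
For reference, the in-place update above is the standard tabular Q-learning rule; written as a standalone helper over the same dict-of-dicts table (a sketch, not part of the original agent):

def q_learning_update(Q, s, a, r, s_next, alpha, gamma):
    # Q(s, a) <- Q(s, a) + alpha * (r + gamma * max_a' Q(s', a') - Q(s, a))
    best_next = max(Q[s_next].values()) if Q[s_next] else 0.0
    Q[s][a] += alpha * (r + gamma * best_next - Q[s][a])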
Example #6
    def act(self, gs: GameState) -> int:
        available_actions = gs.get_available_actions(gs.get_active_player())

        state_vec = gs.get_vectorized_state()
        predicted_Q_values = self.Q_action.predict(state_vec)

        if np.random.random() <= self.epsilon:
            chosen_action = np.random.choice(available_actions)
        else:
            chosen_action = available_actions[int(
                np.argmax(predicted_Q_values[available_actions]))]

        if self.s is not None:
            target = self.r + self.gamma * predicted_Q_values[int(
                np.argmax(self.Q_evaluation.predict(self.s)))]
            self.Q_action.train(self.s, self.a, target)
            self.experience.append(
                (self.s.copy(), self.a.copy(), self.r, state_vec.copy()))

        if len(self.experience) % 10 == 0 and len(
                self.experience) > 0 and self.epsilon > 0:

            batch = sample(
                self.experience,
                len(self.experience) if len(self.experience) < 30 else 30)

            el_state = [x[0] for x in batch]
            el_a = [x[1] for x in batch]
            el_r = [x[2] for x in batch]
            el_state_plus_1 = [x[3] for x in batch]

            predicted_Q_values_list = self.Q_action.model.predict(
                np.array(el_state_plus_1))

            Q_star = [
                x[int(np.argmax(self.Q_evaluation.predict(el_state[i])))]
                for i, x in enumerate(predicted_Q_values_list)
            ]
            Q_star_np = np.array(Q_star)

            target = np.array(el_r) + self.gamma * Q_star_np

            self.Q_action.retrain(np.array(el_state), np.array(el_a), target)

        if self.s is not None:
            update_Q_evaluation = self.tau * np.array(
                self.Q_action.model.get_weights()) + (1 - self.tau) * np.array(
                    self.Q_evaluation.model.get_weights())
            self.Q_evaluation.model.set_weights(update_Q_evaluation)

        self.s = state_vec
        self.a = to_categorical(chosen_action, self.action_space_size)
        self.r = 0.0
        self.count_state += 1

        return chosen_action
Example #7
    def act(self, gs: GameState) -> int:
        root_hash = gs.get_unique_id()
        memory = self.memory if self.keep_memory else dict()

        if root_hash not in memory:
            MOMCTSAgent.create_node_in_memory(memory, root_hash, gs.get_available_actions(
                gs.get_active_player()), gs.get_active_player()
                                          )

        for i in range(self.max_iteration):
            gs_copy = gs.clone()
            s = gs_copy.get_unique_id()
            history = []

            # SELECTION
            while not gs_copy.is_game_over() and all((edge['n'] > 0
                                                      for edge in memory[s]
                                                      )):
                chosen_edge = max(((edge, MOMCTSAgent.ucb_1(edge))
                                   for edge in memory[s]
                                   ), key=lambda kv: kv[1])[0]
                history.append((s, chosen_edge))

                gs_copy.step(gs_copy.get_active_player(), chosen_edge['a'])
                s = gs_copy.get_unique_id()
                if s not in memory:
                    MOMCTSAgent.create_node_in_memory(memory, s, gs_copy.get_available_actions(
                        gs_copy.get_active_player()
                    ), gs_copy.get_active_player())

            # EXPANSION
            if not gs_copy.is_game_over():
                chosen_edge = choice(list(filter(lambda e: e['n'] == 0, (edge for edge in memory[s]))))

                history.append((s, chosen_edge))
                gs_copy.step(gs_copy.get_active_player(), chosen_edge['a'])
                s = gs_copy.get_unique_id()
                if s not in memory:
                    MOMCTSAgent.create_node_in_memory(memory, s, gs_copy.get_available_actions(
                        gs_copy.get_active_player()
                    ), gs_copy.get_active_player())

            # SIMULATION
            while not gs_copy.is_game_over():
                gs_copy.step(gs_copy.get_active_player(),
                             choice(gs_copy.get_available_actions(gs_copy.get_active_player())))

            scores = gs_copy.get_scores()
            # BACKPROPAGATION
            for (s, edge) in history:
                edge['n'] += 1
                edge['r'] += scores[edge['p']]
                for neighbour_edge in memory[s]:
                    neighbour_edge['np'] += 1

        return max((edge for edge in memory[root_hash]), key=lambda e: e['n'])['a']
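
The tree policy above relies on two helpers that are not shown, MOMCTSAgent.create_node_in_memory and MOMCTSAgent.ucb_1; plausible minimal versions, inferred from the edge fields 'a', 'p', 'n', 'np' and 'r' used here (assumptions, not the original code):

import math

def create_node_in_memory(memory, node_hash, available_actions, player):
    # one edge per available action: action, acting player, visit count,
    # sibling (parent) visit count and accumulated reward
    memory[node_hash] = [{'a': a, 'p': player, 'n': 0, 'np': 0, 'r': 0.0}
                         for a in available_actions]

def ucb_1(edge, c: float = math.sqrt(2.0)) -> float:
    # mean reward plus an exploration bonus based on the sibling visit count
    if edge['n'] == 0:
        return float('inf')
    return edge['r'] / edge['n'] + c * math.sqrt(math.log(edge['np']) / edge['n'])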
Example #8
def run_for_n_games_and_return_stats(
        agents: List[Agent], gs: GameState,
        games_count: int) -> (np.ndarray, np.ndarray):
    total_scores = np.zeros_like(gs.get_scores())

    for _ in range(games_count):
        gs_copy = gs.clone()
        run_to_the_end(agents, gs_copy)
        total_scores += gs_copy.get_scores()

    return total_scores, total_scores / games_count
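
A typical call, using the RandomAgent shown later and a hypothetical TicTacToeGameState implementation of GameState:

# total and mean per-player scores over 100 games
total_scores, mean_scores = run_for_n_games_and_return_stats(
    [RandomAgent(), RandomAgent()], TicTacToeGameState(), 100)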
Example #9
    def act(self, gs: GameState) -> int:
        print(gs)
        available_actions = gs.get_available_actions(gs.get_active_player())

        if self.action in available_actions:
            self.message = "Your turn"
            return self.action
        else:
            self.message = "Action not valid, please try again with : " + str(
                available_actions)
            return -1
Example #10
def run_for_n_games_and_return_max(agents: List[Agent], gs: GameState,
                                   games_count: int) -> np.ndarray:
    old_and_new_scores = np.ones((2, len(gs.get_scores()))) * -9999.9

    for _ in range(games_count):
        gs_copy = gs.clone()
        run_to_the_end(agents, gs_copy)
        new_scores = gs_copy.get_scores()
        old_and_new_scores[1, :] = new_scores
        old_and_new_scores[0, :] = np.max(old_and_new_scores, axis=0)

    return old_and_new_scores[0, :]
Example #11
    def act(self, gs: GameState) -> int:
        self.priority.append(0.001)
        self.memory.append((self.s, self.s, self.a, self.r, True))
        available_actions = gs.get_available_actions(gs.get_active_player())
        state_vec = gs.get_vectorized_state()
        predicted_Q_values = self.Q.predict(state_vec)
        if np.random.random() <= self.epsilon:
            chosen_action = np.random.choice(available_actions)
        else:
            chosen_action = available_actions[int(
                np.argmax(predicted_Q_values[available_actions]))]

        batch, importance = self.get_priority_experience_batch()
        for b, i in zip(batch, importance):
            state, next_state, action, reward, done = b
            target = reward
            if not done:
                if self.s is not None:
                    # target = target + self.gamma * self.alternate_Q.predict(state_vec)[available_actions][
                    #                 np.argmax(self.Q.predict(state_vec)[available_actions])]
                    # self.Q.train(self.s, self.a, target)
                    q_next = reward + self.gamma * self.alternate_Q.predict(
                        next_state)[available_actions][np.argmax(
                            self.Q.predict(next_state)[available_actions])]
                    target = q_next
                    # the priority is the TD error between the bootstrapped
                    # target and the current estimate of the taken action
                    q = self.Q.predict(state)[int(np.argmax(action))]
                    p = (np.abs(q_next - q) + (np.e**-10))**self.alpha
                    self.priority.append(p)
                    self.memory.append(
                        (state, next_state, action, reward, done))
                    self.Q.train(self.s, self.a, target)

            imp = i**(1 - self.epsilon)
            imp = np.reshape(imp, 1)
            # self.remember(self.s, state_vec, self.a, self.r, True)

        # batch = random.choices(self.memory, k=self.batch_size)
        # for state, next_state, action, reward, done in batch:
        #     target = reward
        #     if not done:
        #         if self.s is not None:
        #             target = target + self.gamma * self.alternate_Q.predict(state_vec)[available_actions][
        #                 np.argmax(self.Q.predict(state_vec)[available_actions])]
        #             self.Q.train(self.s, self.a, target)
        #             self.remember(self.s, state_vec, self.a, self.r, True)
        self.s = state_vec
        self.a = to_categorical(chosen_action, self.action_space_size)
        self.r = 0.0
        return chosen_action
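
self.get_priority_experience_batch is not shown in this excerpt; a minimal proportional-prioritization version consistent with the self.priority and self.memory lists used above (an assumed sketch, including the batch size, rather than the original method):

    def get_priority_experience_batch(self, batch_size: int = 32):
        # sample transitions with probability proportional to their priority
        priorities = np.asarray(self.priority, dtype=np.float64)
        probabilities = priorities / priorities.sum()
        indices = np.random.choice(len(self.memory),
                                   size=min(batch_size, len(self.memory)),
                                   p=probabilities)
        batch = [self.memory[i] for i in indices]
        # importance-sampling weights correct for the non-uniform sampling
        importance = 1.0 / (len(self.memory) * probabilities[indices])
        return batch, importance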
Example #12
    def act(self, gs: GameState) -> int:
        print(gs)
        available_actions = gs.get_available_actions(gs.get_active_player())
        print(f"Choose action index from : {available_actions}")

        while True:
            try:
                action_candidate = int(input())
                if action_candidate in available_actions:
                    break
            except Exception as _:
                pass
            print('Action not valid, please try again!')
        return action_candidate
Example #13
    def act(self, gs: GameState) -> int:
        available_actions = gs.get_available_actions(gs.get_active_player())

        state_vec = gs.get_vectorized_state()
        predicted_Q_values = self.Q.predict(state_vec)
        if np.random.random() <= self.epsilon:
            chosen_action = np.random.choice(available_actions)
        else:
            chosen_action = available_actions[int(
                np.argmax(predicted_Q_values[available_actions]))]

        if self.s is not None:
            target = self.r + self.gamma * max(
                predicted_Q_values[available_actions])
            self.Q.train(self.s, self.a, target)

        self.s = state_vec
        self.a = to_categorical(chosen_action, self.action_space_size)
        self.r = 0.0

        return chosen_action
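
Throughout these agents, self.Q (and the Q_action / Q_evaluation / alternate_Q variants) is assumed to wrap a small network exposing predict(state), returning a 1-D vector of Q-values, and train(state, action_one_hot, target), performing one fitting step; a minimal sketch of such a wrapper, assuming tf.keras and not claiming to be the original class:

import numpy as np
import tensorflow as tf

class QWrapper:
    def __init__(self, state_size: int, action_space_size: int):
        self.model = tf.keras.Sequential([
            tf.keras.layers.Dense(64, activation='tanh',
                                  input_shape=(state_size,)),
            tf.keras.layers.Dense(action_space_size, activation='linear'),
        ])
        self.model.compile(optimizer='adam', loss='mse')

    def predict(self, state_vec: np.ndarray) -> np.ndarray:
        # Q-values for every action of a single state
        return self.model.predict(np.array([state_vec]), verbose=0)[0]

    def train(self, state_vec, action_one_hot, target) -> None:
        # move only the chosen action's Q-value towards the target
        q_values = self.predict(state_vec)
        q_values[int(np.argmax(action_one_hot))] = target
        self.model.fit(np.array([state_vec]), np.array([q_values]), verbose=0)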
Example #14
    def act(self, gs: GameState) -> int:
        available_actions = gs.get_available_actions(gs.get_active_player())

        state_vec = gs.get_vectorized_state()

        action_probs = self.Q_policy_function.predict(state_vec)

        # keep only the probabilities of the available actions and renormalize
        valid_probs = action_probs[available_actions]
        valid_probs = valid_probs / np.sum(valid_probs)
        chosen_action = np.random.choice(available_actions, p=valid_probs)

        self.state.append(state_vec)
        self.rewards.append(self.r)
        self.log_probs.append(np.log(action_probs))
        self.probs.append(action_probs)
        self.action.append(chosen_action)
        self.a.append(to_categorical(chosen_action, self.action_space_size))

        self.r = 0.0

        return chosen_action
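
act() above only records the trajectory; the policy-gradient update itself would run at episode end, weighting each step's log-probability by its discounted return. Computing those returns is a self-contained step; a sketch:

def discounted_returns(rewards, gamma: float) -> np.ndarray:
    # G_t = r_t + gamma * r_{t+1} + gamma^2 * r_{t+2} + ...
    returns = np.zeros(len(rewards))
    running = 0.0
    for t in reversed(range(len(rewards))):
        running = rewards[t] + gamma * running
        returns[t] = running
    return returns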
Example #15
def run_step(agents: List[Agent], gs: GameState):
    assert (not gs.is_game_over())
    active_player_index = gs.get_active_player()

    old_scores = gs.get_scores().copy()
    action = agents[active_player_index].act(gs)
    gs.step(active_player_index, action)
    new_scores = gs.get_scores()
    rewards = new_scores - old_scores
    for i, agent in enumerate(agents):
        agent.observe(rewards[i], gs.is_game_over(), i)
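
run_step assumes every Agent exposes act and observe with the signature used here; a minimal base class, stated as an assumption for context:

class Agent:
    def act(self, gs: 'GameState') -> int:
        raise NotImplementedError

    def observe(self, reward: float, terminal: bool, player_index: int) -> None:
        # agents that do not learn can simply ignore the reward signal
        pass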
Example #16
def run_for_n_games_and_return_stats(
        agents: List[Agent],
        gs: GameState,
        games_count: int,
        shuffle_players: bool = False) -> (np.ndarray, np.ndarray):
    total_scores = np.zeros_like(gs.get_scores())
    agents_order = np.arange(len(agents))

    agents_copy = agents
    for _ in range(games_count):
        gs_copy = gs.clone()
        if shuffle_players:
            shuffle(agents_order)
            # seat i is played by the original agent agents_order[i]
            agents_copy = [agents[j] for j in agents_order]
        run_to_the_end(agents_copy, gs_copy)
        # credit each original agent with the score of the seat it occupied
        total_scores[agents_order] += gs_copy.get_scores()

    return total_scores, total_scores / games_count
Example #17
    def act(self, gs: GameState) -> int:
        available_actions = gs.get_available_actions(gs.get_active_player())
        state_vec = gs.get_vectorized_state()
        predicted_Q_values = self.Q.predict(state_vec)
        if np.random.random() <= self.epsilon:
            chosen_action = np.random.choice(available_actions)
        else:
            chosen_action = available_actions[int(
                np.argmax(predicted_Q_values[available_actions]))]
        batch = random.choices(self.memory, k=self.batch_size)
        for state, next_state, action, reward, done in batch:
            target = reward
            if not done:
                if self.s is not None:
                    target = target + self.gamma * self.alternate_Q.predict(
                        state_vec)[available_actions][np.argmax(
                            self.Q.predict(state_vec)[available_actions])]
                    self.Q.train(self.s, self.a, target)
                    self.remember(self.s, state_vec, self.a, self.r, True)
        self.s = state_vec
        self.a = to_categorical(chosen_action, self.action_space_size)
        self.r = 0.0
        return chosen_action
Example #18
    def act(self, gs: GameState) -> int:
        available_actions = gs.get_available_actions(gs.get_active_player())
        if self.agents is None:
            self.agents = [RandomAgent()] * gs.player_count()
        accumulated_scores = np.zeros((len(available_actions),))

        for i, a in enumerate(available_actions):
            gs_clone = gs.clone()
            gs_clone.step(gs.get_active_player(), a)
            if self.determinist_environment:
                max_scores = run_for_n_games_and_return_max(self.agents, gs_clone, self.epochs_per_action)
                accumulated_scores[i] = max_scores[gs.get_active_player()]
            else:
                (total_scores, _) = run_for_n_games_and_return_stats(self.agents, gs_clone, self.epochs_per_action)
                accumulated_scores[i] = total_scores[gs.get_active_player()]

        # print((accumulated_scores, available_actions[np.argmax(accumulated_scores)]))
        return available_actions[np.argmax(accumulated_scores)]
Example #19
    def act(self, gs: GameState) -> int:
        """
        Args:
            board [int] : the current state of the board
        Returns:
            int : the index of the move to be made
        """

        # TODO would it be better to only map to qubits that could be moves
        # However this would mean lots of index/physical qubit number swaps

        board = [-1] * 9
        # print(board)
        # print(gs.get_available_actions(gs.get_active_player()))
        for x in gs.get_available_actions(gs.get_active_player()):
            # print(x)
            board[x] = None

        # print(board)

        num_qubits = 9

        use_ibm = False
        if self.i == 0 and use_ibm:
            IBMQ.save_account('mykey', overwrite=True)

            IBMQ.load_account()
            # provider = IBMQ.get_provider(hub='ibm-q')

            # backend = least_busy(provider.backends(filters=lambda x:
            # x.configuration().n_qubits >= 9 and
            #     not x.configuration().simulator and
            #     x.status().operational == True))
            # print("least busy backend: ", backend)
            self.i += 1

        # reset the T gate counter
        self.num_t_gates = [0] * num_qubits
        # allocate a quantum register of 9 qubits, one per board square
        q = QuantumRegister(num_qubits)
        # the classical register stores the measurement results, one bit per
        # square of the board
        c = ClassicalRegister(num_qubits)
        # create the circuit
        qc = QuantumCircuit(q, c)

        # force the qubit to |1> if the space is already occupied
        for index, move in enumerate(board):

            if move:
                # delete action from possibilities
                qc.x(q[index])
            else:
                # this space is a potential move
                # so put into a superposition
                qc.h(q[index])

                t_count = 0

                # so it is the start of a row
                if index % 3 == 0:

                    # two pieces in a row - need to block/win
                    if board[index + 1] and board[index + 2] and board[
                            index + 1] == board[index + 2]:
                        qc.t(q[index])
                        qc.t(q[index])
                        t_count += 2
                    # only one of the spaces is occupied (^ is xor, but they both have to explicitly be bools)
                    elif bool(board[index + 1]) ^ bool(board[index + 2]):
                        qc.t(q[index])
                        t_count += 1

                # so it is the middle of a row
                if index % 3 == 1:

                    # two pieces in a row - need to block/win
                    if board[index + 1] and board[index - 1] and board[
                            index + 1] == board[index - 1]:
                        qc.t(q[index])
                        qc.t(q[index])
                        t_count += 2
                    elif bool(board[index + 1]) ^ bool(board[index - 1]):
                        qc.t(q[index])
                        t_count += 1

                # so it is the end of a row
                if index % 3 == 2:
                    # two pieces in a row - need to block/win
                    if board[index - 1] and board[index - 2] and board[
                            index - 1] == board[index - 2]:
                        qc.t(q[index])
                        qc.t(q[index])
                        t_count += 2
                    elif bool(board[index - 1]) ^ bool(board[index - 2]):
                        qc.t(q[index])
                        t_count += 1

                # so it is the top row
                if index / 3 < 1:
                    if board[index + 3] and board[index + 6] and board[
                            index + 3] == board[index + 6]:
                        qc.t(q[index])
                        qc.t(q[index])
                        t_count += 2
                    elif bool(board[index + 3]) ^ bool(board[index + 6]):
                        qc.t(q[index])
                        t_count += 1

                # so it is the middle row
                if 2 > index / 3 >= 1:
                    if board[index - 3] and board[index + 3] and board[
                            index - 3] == board[index + 3]:
                        qc.t(q[index])
                        qc.t(q[index])
                        t_count += 2
                    elif bool(board[index - 3]) ^ bool(board[index + 3]):
                        qc.t(q[index])
                        t_count += 1

                # so it is the bottom row
                if index / 3 >= 2:
                    if board[index - 3] and board[index - 6] and board[
                            index - 3] == board[index - 6]:
                        qc.t(q[index])
                        qc.t(q[index])
                        t_count += 2
                    elif bool(board[index - 3]) ^ bool(board[index - 6]):
                        qc.t(q[index])
                        t_count += 1

                self.num_t_gates[index] = t_count

        # hard code in the diagonals
        if board[0] and board[0] == board[4]:
            qc.t(q[8])
            qc.t(q[8])
            qc.t(q[8])
            self.num_t_gates[8] += 3
        if board[0] and board[0] == board[8]:
            qc.t(q[4])
            qc.t(q[4])
            qc.t(q[4])
            self.num_t_gates[4] += 3
        if board[4] and board[4] == board[8]:
            qc.t(q[0])
            qc.t(q[0])
            qc.t(q[0])
            self.num_t_gates[0] += 3
        if board[2] and board[2] == board[4]:
            qc.t(q[6])
            qc.t(q[6])
            qc.t(q[6])
            self.num_t_gates[6] += 3
        if board[2] and board[2] == board[6]:
            qc.t(q[4])
            qc.t(q[4])
            qc.t(q[4])
            self.num_t_gates[4] += 3
        if board[4] and board[4] == board[6]:
            qc.t(q[2])
            qc.t(q[2])
            qc.t(q[2])
            self.num_t_gates[2] += 3

        for index, move in enumerate(board):
            if not move:
                qc.h(q[index])
            else:
                # if there is already a move there - don't show that any t gates were applied
                self.num_t_gates[index] = -1
        qc.measure(q, c)

        backend = Aer.get_backend('qasm_simulator')
        logger.info(
            "Made the circuit, running it on the backend: {}".format(backend))
        shots = 100
        # job_sim = execute(qc, backend, shots=shots)
        job_sim = execute(qc, backend=backend, shots=shots)
        sim_result = job_sim.result().get_counts(qc)

        job_monitor(job_sim, interval=2)

        counts = [0] * num_qubits

        for key, count in sim_result.items():
            # need to iterate over the results and see when the value was chosen most

            # keys are the opposite way round to expected
            key = key[::-1]
            for index, val in enumerate(key):
                if val == '1':
                    counts[index] += count

        max_count = 0
        max_index = 0
        for index, count in enumerate(counts):
            if not board[index] and count > max_count:
                max_index = index
                max_count = count

        self.move = max_index
        results = job_sim.result()
        answer = results.get_counts(qc)
        '''
        plot_circuit = qc.draw(output="mpl")
        plot_hist = plot_histogram(answer)
        plot_circuit = qc.draw(output="mpl")
        plot_hist.show()
        plot_circuit.show()
        plot_hist.savefig('quantum_grover_agent_histogram.png')
        '''

        # logger.info("Quantum choice = {}".format(self.move))
        return self.move
Example #20
    def act(self, gs: GameState) -> int:
        """
        Args:
            board [int] : the current state of the board
        Returns:
            int : the index of the move to be made
        """

        board = [-1] * 9

        for x in gs.get_available_actions(gs.get_active_player()):
            print(x)
            board[x] = None

        # superposition of potential moves
        groverCircuit, registers = self._board_to_superposition(board)

        # this means the only available space is the last one
        if isinstance(groverCircuit, int):
            self.move = groverCircuit
            return self.move

        QuantumGroverAgent._oracle(groverCircuit, registers[0])
        QuantumGroverAgent._inversion_about_average(groverCircuit,
                                                    registers[0], 3)
        groverCircuit.measure(registers[0], registers[1])

        # run circuit
        backend = Aer.get_backend('qasm_simulator')
        shots = 1024
        results = execute(groverCircuit, backend=backend, shots=shots).result()
        answer = results.get_counts(groverCircuit)
        '''
        plot_hist = plot_histogram(answer)
        plot_circuit = groverCircuit.draw(output="mpl")
        plot_hist.show()
        plot_circuit.show()
        plot_hist.savefig('quantum_grover_agent_histogram.png')
        '''
        answer = results.get_counts()
        reversed_answer = {}

        # reverse all the keys
        for state, count in answer.items():
            reversed_answer[state[::-1]] = count

        print(reversed_answer)

        # get the highest move
        max_count = 0
        winning_state = ''
        for state, count in reversed_answer.items():
            if count > max_count:
                max_count = count
                winning_state = state

        print('The move is ', winning_state, ' which is the same as ',
              str(int(winning_state, 2)))

        self.move = int(winning_state, 2)
        self.counts = reversed_answer

        # shouldn't happen? but just in case
        if board[int(winning_state, 2)]:
            spaces = [i for i, mv in enumerate(board) if mv is None]
            self.move = random.choice(spaces)

        # logger.info("Quantum choice = {}".format(self.move))
        return self.move
Example #21
    def act(self, gs: GameState) -> int:
        available_actions = gs.get_available_actions(gs.get_active_player())
        return np.random.choice(available_actions)
Example #22
def run_to_the_end(agents: List[Agent], gs: GameState):
    while not gs.is_game_over():
        run_step(agents, gs)
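
Driving a single game to completion, again with a hypothetical TicTacToeGameState:

gs = TicTacToeGameState()
run_to_the_end([RandomAgent(), RandomAgent()], gs)
print(gs.get_scores())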
Example #23
    def act(self, gs: GameState) -> int:

        if self.apprentice_training_count > self.apprentice_training_before_takeover:
            return gs.get_available_actions(gs.get_active_player())[np.argmax(
                self.brain.predict(np.array([
                    gs.get_vectorized_state()
                ]))[0][gs.get_available_actions(gs.get_active_player())])]

        root_hash = gs.get_unique_id()
        memory = self.memory if self.keep_memory else dict()

        if root_hash not in memory:
            ExpertApprenticeAgent.create_node_in_memory(
                memory, root_hash,
                gs.get_available_actions(gs.get_active_player()),
                gs.get_active_player())

        for i in range(self.max_iteration):
            gs_copy = gs.clone()
            s = gs_copy.get_unique_id()
            history = []

            # SELECTION
            while not gs_copy.is_game_over() and all(
                (edge['n'] > 0 for edge in memory[s])):
                chosen_edge = max(((edge, ExpertApprenticeAgent.ucb_1(edge))
                                   for edge in memory[s]),
                                  key=lambda kv: kv[1])[0]
                history.append((s, chosen_edge))

                gs_copy.step(gs_copy.get_active_player(), chosen_edge['a'])
                s = gs_copy.get_unique_id()
                if s not in memory:
                    ExpertApprenticeAgent.create_node_in_memory(
                        memory, s,
                        gs_copy.get_available_actions(
                            gs_copy.get_active_player()),
                        gs_copy.get_active_player())

            # EXPANSION
            if not gs_copy.is_game_over():
                chosen_edge = choice(
                    list(
                        filter(lambda e: e['n'] == 0,
                               (edge for edge in memory[s]))))

                history.append((s, chosen_edge))
                gs_copy.step(gs_copy.get_active_player(), chosen_edge['a'])
                s = gs_copy.get_unique_id()
                if s not in memory:
                    ExpertApprenticeAgent.create_node_in_memory(
                        memory, s,
                        gs_copy.get_available_actions(
                            gs_copy.get_active_player()),
                        gs_copy.get_active_player())

            # SIMULATION
            while not gs_copy.is_game_over():
                gs_copy.step(
                    gs_copy.get_active_player(),
                    choice(
                        gs_copy.get_available_actions(
                            gs_copy.get_active_player())))

            scores = gs_copy.get_scores()
            # BACKPROPAGATION
            for (s, edge) in history:
                edge['n'] += 1
                edge['r'] += scores[edge['p']]
                for neighbour_edge in memory[s]:
                    neighbour_edge['np'] += 1

        target = np.zeros(gs.get_action_space_size())

        for edge in memory[root_hash]:
            target[edge['a']] = edge['n']

        target /= np.sum(target)

        self.states_buffer.append(gs.get_vectorized_state())
        self.actions_buffer.append(target)

        if len(self.states_buffer) > 200:
            self.apprentice_training_count += 1
            self.brain.fit(np.array(self.states_buffer),
                           np.array(self.actions_buffer))
            self.states_buffer.clear()
            self.actions_buffer.clear()

        if self.apprentice_training_count > self.apprentice_training_before_takeover:
            print('Apprentice is playing next round')

        return max((edge for edge in memory[root_hash]),
                   key=lambda e: e['n'])['a']
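
self.brain is not defined in this excerpt; a plausible minimal apprentice network mapping the vectorized state to a distribution over actions, trained on the normalized MCTS visit counts (an assumption, using tf.keras):

import tensorflow as tf

def build_apprentice_brain(state_size: int, action_space_size: int):
    model = tf.keras.Sequential([
        tf.keras.layers.Dense(64, activation='relu',
                              input_shape=(state_size,)),
        tf.keras.layers.Dense(64, activation='relu'),
        tf.keras.layers.Dense(action_space_size, activation='softmax'),
    ])
    # cross-entropy against the normalized visit counts produced by the search
    model.compile(optimizer='adam', loss='categorical_crossentropy')
    return model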
Example #24
    def act(self, gs: GameState) -> int:
        root_hash = gs.get_unique_id()
        memory = self.memory if self.keep_memory else dict()

        if root_hash not in memory:
            q_values = self.brain.predict(gs.get_vectorized_state())
            HalfAlphaZeroAgent.create_node_in_memory(
                memory, root_hash,
                gs.get_available_actions(gs.get_active_player()),
                gs.get_active_player(), q_values)

        for i in range(self.max_iteration):
            gs_copy = gs.clone()
            s = gs_copy.get_unique_id()
            history = []

            # SELECTION
            while not gs_copy.is_game_over() and all(
                (edge['n'] > 0 for edge in memory[s])):
                chosen_edge = max(((edge, HalfAlphaZeroAgent.ucb_1(edge))
                                   for edge in memory[s]),
                                  key=lambda kv: kv[1])[0]
                history.append((s, chosen_edge))

                gs_copy.step(gs_copy.get_active_player(), chosen_edge['a'])
                s = gs_copy.get_unique_id()
                if s not in memory:
                    q_values = self.brain.predict(
                        gs_copy.get_vectorized_state())
                    HalfAlphaZeroAgent.create_node_in_memory(
                        memory, s,
                        gs_copy.get_available_actions(
                            gs_copy.get_active_player()),
                        gs_copy.get_active_player(), q_values)

            # EXPANSION
            if not gs_copy.is_game_over():
                chosen_edge = choice(
                    list(
                        filter(lambda e: e['n'] == 0,
                               (edge for edge in memory[s]))))

                history.append((s, chosen_edge))
                gs_copy.step(gs_copy.get_active_player(), chosen_edge['a'])
                s = gs_copy.get_unique_id()
                if s not in memory:
                    q_values = self.brain.predict(
                        gs_copy.get_vectorized_state())
                    HalfAlphaZeroAgent.create_node_in_memory(
                        memory, s,
                        gs_copy.get_available_actions(
                            gs_copy.get_active_player()),
                        gs_copy.get_active_player(), q_values)

            scores = np.zeros(gs_copy.player_count())
            scores_set = np.zeros(gs_copy.player_count())
            # BACKPROPAGATION
            for (s, edge) in history:
                if scores_set[edge['p']] == 0:
                    scores_set[edge['p']] = 1.0
                    scores[edge['p']] = edge['q']

                edge['n'] += 1
                edge['r'] += scores[edge['p']]
                for neighbour_edge in memory[s]:
                    neighbour_edge['np'] += 1

        chosen_action = max((edge for edge in memory[root_hash]),
                            key=lambda e: e['n'])['a']

        if len(self.states_buffer) > 0:
            self.rewards_buffer.append(self.intermediate_reward)

        self.states_buffer.append(gs.get_vectorized_state())
        self.actions_buffer.append(
            to_categorical(chosen_action, gs.get_action_space_size()))
        self.intermediate_reward = 0.0

        return chosen_action
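
HalfAlphaZeroAgent.create_node_in_memory additionally stores a value estimate per edge, the 'q' field read during the backup phase above; a sketch consistent with that usage, not the original helper:

def create_node_in_memory(memory, node_hash, available_actions, player, q_values):
    # each edge keeps the network's value estimate for its action as 'q'
    memory[node_hash] = [{'a': a, 'p': player, 'n': 0, 'np': 0, 'r': 0.0,
                          'q': q_values[a]}
                         for a in available_actions]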