Example 1
    def run(self):
        """Game playing process
        for online gaming, the process will be spawn using child_process in Node.js
        communication is done with web socket

        Arguments:
        None

        Returns:
        None
        """
        state = State()
        s = state.get_initial_state()
        c_player = 1
        while True:
            action = self.get_action(state, state.player)
            self._AI._tree.step(encode_action(action))
            logger.debug("position: {} {}".format(action[0], action[1]))
            flag, new_s, R = state.take_action(*action)
            v = self._evaluate([s, s], [c_player, -c_player])
            new_v = self._evaluate([new_s, new_s], [c_player, -c_player])
            self._update(
                [s, s], [1, -1],
                [
                    TD(v[0], new_v[0], R, self._AI.alpha, self._AI.gamma),
                    TD(v[1], new_v[1], -R, self._AI.alpha, self._AI.gamma)
                ])
            if state.terminate(): break
            s = new_s
            c_player *= -1
        for i in [-1, 0, 1]:
            if state.win(i): return i
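
The TD(...) helper used above is not defined in this example. A minimal sketch of a standard TD(0) update target, assuming TD(v, new_v, R, alpha, gamma) returns the updated value estimate for the old state (the actual helper in the project may differ):

def TD(v, new_v, reward, alpha, gamma):
    # Hypothetical TD(0) update: move v toward reward + gamma * new_v
    # by a step of size alpha. This is an assumption, not the project's code.
    return v + alpha * (reward + gamma * new_v - v)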
Example 2
 def generate_action(self, state):
     # Take an immediately winning move for this player if one exists.
     move = get_winning_move(state, self._player)
     if len(move) > 0: return move[0][1], move[0][2]
     # Otherwise block the opponent's immediately winning move.
     move = get_winning_move(state, self._opponent)
     if len(move) > 0: return move[0][1], move[0][2]
     # Fall back to a uniformly random valid action.
     env = State(state)
     actions = [i for i in range(16) if env.valid_action(*decode_action(i))]
     return_action = decode_action(random.choice(actions))
     return return_action[0], return_action[1]
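
decode_action and encode_action are used throughout these examples but not shown; the sixteen action indices suggest a 4x4 board, so a plausible, purely hypothetical mapping is:

def decode_action(i):
    # Assumed mapping: action index 0..15 -> (row, col) on a 4x4 board.
    return i // 4, i % 4

def encode_action(action):
    # Assumed inverse mapping: (row, col) -> action index 0..15.
    row, col = action
    return row * 4 + col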
Example 3
 def get_action(self, state, player):
     """play n_playout playouts, choose action greedily on the basis of visits"""
     for n in range(self._n_playout):
         # if n % 100 == 0: logger.debug("playout: {}".format(n))
         n_state = State(state)
         self._playout(n_state)
     # for _id, node in self._root._children.items():
     # logger.debug("id = {}, value = {}".format(_id, node._get_value(self._c)))
     return max(self._root._children.items(),
                key=lambda child: child[1]._get_value(self._c))[0]
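
The child nodes' _get_value(self._c) is not shown here. One common choice for such a selection score is a UCB1-style value that trades the node's mean value against a c-weighted exploration bonus; a hypothetical sketch under that assumption:

import math

def uct_value(total_value, n_visits, parent_visits, c):
    # UCB1-style score: mean node value plus an exploration bonus weighted by c.
    # This is an assumed formula; the project's _get_value may differ.
    if n_visits == 0:
        return float("inf")
    return total_value / n_visits + c * math.sqrt(math.log(max(parent_visits, 1)) / n_visits)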
Example 4
    def get_action(self, c_state, player):
        """get action which minimize opponent's maximum reward

        Arguments:
        c_state -- a copy of current state
        player -- current player

        Returns:
        an integer in [0, 15] denoting the selected action
        """
        val, action = self._search(State(c_state), player, self._search_depth)
        return action
Example 5
    def _get_value(self, data):
        """calculate the value of states
        
        Arguments:
        data -- a list of states series tuple

        Returns:
        x -- process the data into a list of (state, player) tuple
        y -- expected value for each (state, player) tuple
        """
        x, y = [], []
        for dat in data:
            for rotate_time in range(4):
                states, rewards = [], {1: [], -1: []}
                s = State()
                c_player = 1
                for (height, row, col) in dat:
                    height, row, col = self._rotate_data(height, row, col, rotate_time)
                    flag, _, r = s.take_action(row, col)
                    rewards[c_player].append(r)
                    rewards[-c_player].append(-r)
                    c_player *= -1
                n = len(rewards[1])
                reward_sum = {1: [0] * (n + 1), -1: [0] * (n + 1)}
                for t in reversed(range(n)):
                    reward_sum[1][t] = reward_sum[1][t + 1] * self.gamma + rewards[1][t]
                    reward_sum[-1][t] = reward_sum[-1][t + 1] * self.gamma + rewards[-1][t]
                s = State()
                c_player = 1
                ind = 0
                for (height, row, col) in dat:
                    height, row, col = self._rotate_data(height, row, col, rotate_time)
                    x.append((np.array(s.get_state()), 1))
                    y.append(reward_sum[1][ind])
                    x.append((np.array(s.get_state()), -1))
                    y.append(reward_sum[-1][ind])
                    s.take_action(row, col)
                    ind += 1
        return x, y
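
The backward loop over rewards implements the usual discounted-return recursion G_t = r_t + gamma * G_{t+1} with G_T = 0; the same computation for a single reward list, as a compact sketch:

def discounted_returns(rewards, gamma):
    # Walk the episode backwards: G_t = r_t + gamma * G_{t+1}, starting from G_T = 0.
    returns = [0.0] * (len(rewards) + 1)
    for t in reversed(range(len(rewards))):
        returns[t] = rewards[t] + gamma * returns[t + 1]
    return returns[:-1]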
Example 6
    def get_action(self, state, player):
        """get action, return AI's action if it's AI's turn, read input from stdin otherwise

        Arguments:
        state -- current state
        player -- current player

        Returns:
        a (row, col) pair denoting action
        """
        n_state = State(state)
        if player == self.player:
            action = self._AI.get_action(n_state)
            emit_action(action)
            return action
        return self.read_action()
Example 7
    def _search(self,
                c_state,
                player,
                depth,
                max_level=True,
                alpha=-np.inf,
                beta=np.inf):
        """recursively search every possible action and evaluate the leaf nodes

        Arguments:
        c_state -- a copy of current state
        player -- current player
        depth -- depth left for furthur searching
        max_level -- whether the level is maximizer
        alpha -- best value along the path from root to maximizer
        beta -- worst value along the path from root to minimizer

        Returns:
        return_val -- value of this node
        return_action -- where return_val comes from
        """
        if c_state.terminate():
            if c_state.win(player): return 1, None
            if c_state.win(-player): return -1, None
            return 0, None
        if depth == 0:
            return self._evaluate([c_state.get_state()], [player]), None
        comp = max if max_level else min
        return_val = -np.inf if max_level else np.inf
        return_action = -1
        actions = [
            i for i in range(16) if c_state.valid_action(*decode_action(i))
        ]
        random.shuffle(actions)
        for i in actions:
            n_state = State(c_state)
            r, c = decode_action(i)
            n_state.take_action(r, c)
            val, action = self._search(n_state, -player, depth - 1,
                                       not max_level, alpha, beta)
            return_val = comp(return_val, val)
            if val == return_val: return_action = i
            alpha, beta, prune = self._alpha_beta_pruning(
                return_val, max_level, alpha, beta)
            if prune: break
        return return_val, return_action
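
self._alpha_beta_pruning is not defined in this example. A minimal sketch of the bound update it presumably performs, assuming it returns the new (alpha, beta) window and a flag telling the caller to stop expanding siblings:

def _alpha_beta_pruning(value, max_level, alpha, beta):
    # Assumed window update: the maximizer raises alpha, the minimizer lowers beta;
    # prune the remaining siblings once the window closes (alpha >= beta).
    if max_level:
        alpha = max(alpha, value)
    else:
        beta = min(beta, value)
    return alpha, beta, alpha >= beta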
Example 8
    def train(self):
        """reinforcement training process
        
        Arguments:
        None

        Returns:
        None
        """
        percentage = 0
        logger.info("[Reinforcement] Start Training")
        logger.info("[Reinforcement] Training Complete: 0%")
        for epoch in range(self._n_epoch):
            state = [State() for _ in range(4)]
            s = [state[_].get_initial_state() for _ in range(4)]
            c_player = 1
            while True:
                action = self._get_action(state[0], c_player)
                flag, new_s, R = zip(*[
                    state[i].take_action(*(self._rotate(*action, i)))
                    for i in range(4)
                ])
                v = self._evaluate(s + s, [c_player for i in range(4)] +
                                   [-c_player for i in range(4)])
                new_v = self._evaluate(new_s + new_s,
                                       [c_player for i in range(4)] +
                                       [-c_player for i in range(4)])
                self._update(
                    *self._concat_training_data(s, v, new_v, R, c_player))
                self._AI.step(encode_action(action))
                self._opponent.step(encode_action(action))
                if state[0].terminate(): break
                s = new_s
                c_player *= -1
            if epoch / self._n_epoch > percentage / 100:
                percentage = math.ceil(epoch / self._n_epoch * 100)
                logger.info('[Reinforcement] Training Complete: {}%'.format(
                    percentage))
            if percentage % 10 == 0: self._store()
            self._AI.refresh()
            self._opponent.refresh()
        logger.info('[Reinforcement] Training Complete: 100%')
        self._store()
Example 9
 def test(self):
     # Play self._n_epoch games and count wins for player 1 and player -1.
     result = {1: 0, -1: 0}
     percentage = 0
     logger.info("[Test] Testing Complete: 0%")
     t0 = time.time()
     for epoch in range(self._n_epoch):
         state = State()
         while True:
             action = self._get_action(state, state.player)
             flag, _, R = state.take_action(*action)
             if state.terminate(): break
             self._step(encode_action(action))
         for i in [1, -1]:
             if state.win(i): result[i] += 1
         if epoch / self._n_epoch > percentage / 100:
             percentage = math.ceil(epoch / self._n_epoch * 100)
             logger.info("[Test] Testing Complete: {}%".format(percentage))
         self._AI.refresh()
     logger.info("[Test] Testing Complete: 100%")
     return result[1], result[-1], time.time() - t0
Example 10
 def _get_action(self, state, player):
     # Query the trained AI on its own turn; otherwise ask the baseline bot.
     n_state = State(state)
     if player == self._player: return self._AI.get_action(n_state)
     return self._bot.generate_action(n_state)