def run(self):
    """Game playing process for online gaming.

    The process will be spawned using child_process in Node.js;
    communication is done over a web socket.

    Arguments:
        None

    Returns:
        the game result: 1 or -1 for the winning player, 0 for a draw
    """
    state = State()
    s = state.get_initial_state()
    c_player = 1
    while True:
        action = self.get_action(state, state.player)
        self._AI._tree.step(encode_action(action))
        logger.debug("position: {} {}".format(action[0], action[1]))
        flag, new_s, R = state.take_action(*action)
        # Evaluate the position before and after the move from both
        # players' perspectives, then apply a TD update to each.
        v = self._evaluate([s, s], [c_player, -c_player])
        new_v = self._evaluate([new_s, new_s], [c_player, -c_player])
        self._update([s, s], [1, -1], [
            TD(v[0], new_v[0], R, self._AI.alpha, self._AI.gamma),
            TD(v[1], new_v[1], -R, self._AI.alpha, self._AI.gamma)
        ])
        if state.terminate():
            break
        s = new_s
        c_player *= -1
    for i in [-1, 0, 1]:
        if state.win(i):
            return i
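
# The TD helper called by run() is not shown in this section. Below is a
# minimal sketch, assuming a standard one-step TD(0)-style target
# v + alpha * (R + gamma * v' - v). The name and signature mirror the call
# sites above; the body is an assumption, not the project's actual code.
def TD(v, new_v, R, alpha, gamma):
    # Move the current estimate toward the bootstrapped target.
    return v + alpha * (R + gamma * new_v - v)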
def generate_action(self, state):
    # Take an immediately winning move if one exists.
    move = get_winning_move(state, self._player)
    if len(move) > 0:
        return move[0][1], move[0][2]
    # Otherwise block the opponent's immediately winning move.
    move = get_winning_move(state, self._opponent)
    if len(move) > 0:
        return move[0][1], move[0][2]
    # Fall back to a uniformly random valid action.
    env = State(state)
    actions = [i for i in range(16) if env.valid_action(*decode_action(i))]
    return_action = decode_action(random.choice(actions))
    return return_action[0], return_action[1]
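
# encode_action/decode_action are assumed to map between a flat action index
# in [0, 16) and a (row, col) pair on a 4x4 board, matching the range(16)
# loops used throughout this section. A minimal sketch under that assumption;
# both bodies are hypothetical:
def encode_action(action):
    row, col = action
    return row * 4 + col

def decode_action(index):
    return divmod(index, 4)  # (row, col)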
def get_action(self, state, player):
    """Play n_playout playouts, then choose the action whose child node has
    the highest value under _get_value."""
    for n in range(self._n_playout):
        n_state = State(state)
        self._playout(n_state)
    return max(self._root._children.items(),
               key=lambda child: child[1]._get_value(self._c))[0]
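
# The node method _get_value is not shown here. A minimal sketch, assuming a
# UCT-style score (mean value plus a c-weighted exploration bonus); _Q,
# _n_visits, and _parent are assumed node attributes, not confirmed ones.
import math

def _get_value(self, c):
    exploration = c * math.sqrt(
        math.log(self._parent._n_visits) / (1 + self._n_visits))
    return self._Q + exploration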
def get_action(self, c_state, player):
    """Get the action which minimizes the opponent's maximum reward.

    Arguments:
        c_state -- a copy of the current state
        player -- current player

    Returns:
        an integer in [0, 16) denoting the selected action
    """
    val, action = self._search(State(c_state), player, self._search_depth)
    return action
def _get_value(self, data):
    """Calculate the value of states.

    Arguments:
        data -- a list of state-series tuples

    Returns:
        x -- the data processed into a list of (state, player) tuples
        y -- the expected value for each (state, player) tuple
    """
    x, y = [], []
    for dat in data:
        # Augment each game record with its four board rotations.
        for rotate_time in range(4):
            # First pass: replay the game to collect per-move rewards
            # from each player's perspective.
            rewards = {1: [], -1: []}
            s = State()
            c_player = 1
            for (height, row, col) in dat:
                height, row, col = self._rotate_data(height, row, col, rotate_time)
                flag, _, r = s.take_action(row, col)
                rewards[c_player].append(r)
                rewards[-c_player].append(-r)
                c_player *= -1
            # Accumulate discounted reward sums backwards in time.
            n = len(rewards[1])
            reward_sum = {1: [0] * (n + 1), -1: [0] * (n + 1)}
            for i in range(n):
                reward_sum[1][n - i - 1] = reward_sum[1][n - i] * self.gamma + \
                    rewards[1][n - i - 1]
                reward_sum[-1][n - i - 1] = reward_sum[-1][n - i] * self.gamma + \
                    rewards[-1][n - i - 1]
            # Second pass: replay again, emitting (state, player) pairs
            # labelled with their discounted returns.
            s = State()
            ind = 0
            for (height, row, col) in dat:
                height, row, col = self._rotate_data(height, row, col, rotate_time)
                x.append((np.array(s.get_state()), 1))
                y.append(reward_sum[1][ind])
                x.append((np.array(s.get_state()), -1))
                y.append(reward_sum[-1][ind])
                s.take_action(row, col)
                ind += 1
    return x, y
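
# _rotate_data is assumed to rotate a move's board coordinates by
# rotate_time quarter-turns on the 4x4 board, leaving the stack height
# unchanged. A minimal sketch of the method under that assumption; the body
# is hypothetical:
def _rotate_data(self, height, row, col, rotate_time):
    for _ in range(rotate_time):
        row, col = col, 3 - row  # one 90-degree rotation on a 4x4 grid
    return height, row, col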
def get_action(self, state, player):
    """Get an action: return the AI's action if it is the AI's turn,
    read input from stdin otherwise.

    Arguments:
        state -- current state
        player -- current player

    Returns:
        a (row, col) pair denoting the action
    """
    n_state = State(state)
    if player == self.player:
        action = self._AI.get_action(n_state)
        emit_action(action)
        return action
    return self.read_action()
def _search(self, c_state, player, depth, max_level=True, alpha=-np.inf, beta=np.inf):
    """Recursively search every possible action and evaluate the leaf nodes.

    Arguments:
        c_state -- a copy of the current state
        player -- current player
        depth -- depth left for further searching
        max_level -- whether this level belongs to the maximizer
        alpha -- best value the maximizer can guarantee along the path from the root
        beta -- best value the minimizer can guarantee along the path from the root

    Returns:
        return_val -- value of this node
        return_action -- the action return_val comes from
    """
    if c_state.terminate():
        if c_state.win(player):
            return 1, None
        if c_state.win(-player):
            return -1, None
        return 0, None
    if depth == 0:
        # Depth limit reached: fall back to the learned evaluation.
        return self._evaluate([c_state.get_state()], [player]), None
    comp = max if max_level else min
    return_val = -np.inf if max_level else np.inf
    return_action = -1
    actions = [i for i in range(16) if c_state.valid_action(*decode_action(i))]
    random.shuffle(actions)
    for i in actions:
        n_state = State(c_state)
        r, c = decode_action(i)
        n_state.take_action(r, c)
        val, action = self._search(n_state, -player, depth - 1,
                                   not max_level, alpha, beta)
        return_val = comp(return_val, val)
        if val == return_val:
            return_action = i
        alpha, beta, prune = self._alpha_beta_pruning(
            return_val, max_level, alpha, beta)
        if prune:
            break
    return return_val, return_action
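
# _alpha_beta_pruning is assumed to implement the standard alpha-beta
# bookkeeping: the maximizer raises alpha, the minimizer lowers beta, and the
# branch is cut once alpha >= beta. A minimal sketch under that assumption;
# the body is hypothetical, though the (alpha, beta, prune) return shape
# follows from the call site above.
def _alpha_beta_pruning(self, val, max_level, alpha, beta):
    if max_level:
        alpha = max(alpha, val)
    else:
        beta = min(beta, val)
    return alpha, beta, alpha >= beta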
def train(self):
    """Reinforcement training process.

    Arguments:
        None

    Returns:
        None
    """
    percentage = 0
    logger.info("[Reinforcement] Start Training")
    logger.info("[Reinforcement] Training Complete: 0%")
    for epoch in range(self._n_epoch):
        # Keep four rotated copies of the board for data augmentation.
        state = [State() for _ in range(4)]
        s = [state[i].get_initial_state() for i in range(4)]
        c_player = 1
        while True:
            action = self._get_action(state[0], c_player)
            # Apply the (rotated) action to each of the four boards.
            flag, new_s, R = zip(*[
                state[i].take_action(*(self._rotate(*action, i)))
                for i in range(4)
            ])
            v = self._evaluate(s + s, [c_player] * 4 + [-c_player] * 4)
            new_v = self._evaluate(new_s + new_s,
                                   [c_player] * 4 + [-c_player] * 4)
            self._update(
                *self._concat_training_data(s, v, new_v, R, c_player))
            self._AI.step(encode_action(action))
            self._opponent.step(encode_action(action))
            if state[0].terminate():
                break
            s = new_s
            c_player *= -1
        if epoch / self._n_epoch > percentage / 100:
            percentage = math.ceil(epoch / self._n_epoch * 100)
            logger.info('[Reinforcement] Training Complete: {}%'.format(
                percentage))
            if percentage % 10 == 0:
                self._store()
        self._AI.refresh()
        self._opponent.refresh()
    logger.info('[Reinforcement] Training Complete: 100%')
    self._store()
def test(self):
    """Play n_epoch games between the AI and the bot; tally wins per player."""
    result = {1: 0, -1: 0}
    percentage = 0
    logger.info("[Test] Testing Complete: 0%")
    t0 = time.time()
    for epoch in range(self._n_epoch):
        state = State()
        while True:
            action = self._get_action(state, state.player)
            flag, _, R = state.take_action(*action)
            if state.terminate():
                break
            self._step(encode_action(action))
        for i in [1, -1]:
            if state.win(i):
                result[i] += 1
        if epoch / self._n_epoch > percentage / 100:
            percentage = math.ceil(epoch / self._n_epoch * 100)
            logger.info("[Test] Testing Complete: {}%".format(percentage))
        self._AI.refresh()
    logger.info("[Test] Testing Complete: 100%")
    return result[1], result[-1], time.time() - t0
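
# Hypothetical usage of test(); `trainer` is an assumed instance name. The
# return values follow from the code above: wins for player 1, wins for
# player -1, and elapsed seconds.
p1_wins, p2_wins, elapsed = trainer.test()
logger.info("player 1: {} wins, player -1: {} wins, {:.1f}s".format(
    p1_wins, p2_wins, elapsed))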
def _get_action(self, state, player):
    # Dispatch to the AI on its turn, otherwise to the baseline bot.
    n_state = State(state)
    if player == self._player:
        return self._AI.get_action(n_state)
    return self._bot.generate_action(n_state)