def match(self):
    s1, s2 = self.strategy_1, self.strategy_2
    print('player1:', s1.__class__.__name__)
    print('player2:', s2.__class__.__name__)

    # probs layout: [s1 wins as black, s1 loses as black, draws,
    #                s1 wins as white, s1 loses as white, draws]
    probs = np.zeros(6)
    games = 100  # 30
    for i in range(games):
        print(i)

        # s1 plays black, s2 plays white
        s1.stand_for = Board.STONE_BLACK
        s2.stand_for = Board.STONE_WHITE
        g = Game(Board.rand_generate_a_position(), s1, s2)
        g.step_to_end()
        if g.winner == Board.STONE_BLACK:
            probs[0] += 1
        elif g.winner == Board.STONE_WHITE:
            probs[1] += 1
        else:
            probs[2] += 1

        # swap colors: s1 plays white, s2 plays black
        s1.stand_for = Board.STONE_WHITE
        s2.stand_for = Board.STONE_BLACK
        g = Game(Board.rand_generate_a_position(), s1, s2)
        g.step_to_end()
        if g.winner == Board.STONE_WHITE:
            probs[3] += 1
        elif g.winner == Board.STONE_BLACK:
            probs[4] += 1
        else:
            probs[5] += 1

    print('total play:', games)
    print(probs)
def self_play(self):
    # generate data with cur_best
    self.load_from_vat(self.cur_best_dir)
    for _ in range(N_GAMES_TRAIN):
        board = Board()
        assert (board.stones == Board.STONE_EMPTY).any()

        memo_s = []
        memo_pi = []
        winner = Board.STONE_EMPTY
        step = 0
        whose_persp = board.whose_turn_now()
        cur_player = whose_persp
        while True:
            self._mcts.sim_many(board, N_SIMS)
            # sample with temperature 1 during the opening, then play ~greedily
            t = 1 if step < N_STEPS_EXPLORE else 1e-9
            step += 1
            pi, move = self._mcts.get_pi_and_best_move(t)
            memo_s.append(board)
            memo_pi.append(pi)

            new_board = copy.deepcopy(board)
            new_board.place_down(move, cur_player)
            over, winner, _ = new_board.is_over(board)
            if over:
                break
            if self.resign(board, pi):
                break
            board = new_board
            cur_player = Board.oppo(cur_player)  # alternate turns

        if winner != Board.STONE_EMPTY:
            # z in {-1, +1}: the game outcome from whose_persp's point of view
            reward = 1 if winner == whose_persp else -1
            memo_z = [0] * len(memo_s)
            # credit the outcome to every recorded state, flipping the sign
            # per ply so each z is from the perspective of the player to move
            memo_z[0::2] = [reward] * len(memo_z[0::2])
            memo_z[1::2] = [-reward] * len(memo_z[1::2])
            self.memo(memo_s, memo_pi, memo_z)
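
# A minimal, self-contained sketch of the z-labelling above (hypothetical
# 5-move game won by the first player): every recorded state receives the
# final outcome, sign-flipped per turn so it is from the mover's perspective.
def _demo_z_labels():
    reward, n = 1, 5
    z = [0] * n
    z[0::2] = [reward] * len(z[0::2])    # states where the first player moves
    z[1::2] = [-reward] * len(z[1::2])   # states where the opponent moves
    assert z == [1, -1, 1, -1, 1]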
def reinforce(self):
    if len(self.oppo_pool) == 0:
        self.oppo_pool.append(StrategyDNN(is_train=False, is_revive=True, is_rl=False))

    s1 = StrategyDNN(is_train=False, is_revive=True, is_rl=True)
    s2 = random.choice(self.oppo_pool)

    stat = []
    win1, win2, draw = 0, 0, 0
    n_lose = 0
    iter_n = 100
    i = 0
    while True:
        print('iter:', i)
        for _ in range(1000):
            # randomize colors each episode
            s1.stand_for = random.choice([Board.STONE_BLACK, Board.STONE_WHITE])
            s2.stand_for = Board.oppo(s1.stand_for)
            g = Game(Board.rand_generate_a_position(), s1, s2, observer=s1)
            g.step_to_end()
            win1 += 1 if g.winner == s1.stand_for else 0
            win2 += 1 if g.winner == s2.stand_for else 0
            draw += 1 if g.winner == Board.STONE_EMPTY else 0

        # if win1 > win2:
        #     s1_c = s1.mind_clone()
        #     self.oppo_pool.append(s1_c)
        #     s2 = random.choice(self.oppo_pool)
        #     n_lose = 0
        #     print('stronger, oppos:', len(self.oppo_pool))
        # elif win1 < win2:
        #     n_lose += 1
        # if n_lose >= 50:
        #     break

        if i % 1 == 0 or i + 1 == iter_n:
            total = win1 + win2 + draw
            win1_r = win1 / total
            win2_r = win2 / total
            draw_r = draw / total
            print("iter:%d, win: %.3f, lose: %.3f, draw: %.3f" % (i, win1_r, win2_r, draw_r))
            stat.append([win1_r, win2_r, draw_r])

        i += 1
        if i > iter_n:
            break

    stat = np.array(stat)
    print('stat. shape:', stat.shape)
    np.savez('/home/splendor/fusor/stat.npz', stat=stat)

    self.strategy_1 = self.strategy_2 = s1
def measure_perf(self, s1, s2):
    old_epsilon1, old_is_learning1, old_stand_for1 = s1.epsilon, s1.is_learning, s1.stand_for
    # old_epsilon2, old_is_learning2, old_stand_for2 = s2.epsilon, s2.is_learning, s2.stand_for
    old_is_learning2, old_stand_for2 = s2.is_learning, s2.stand_for

    s1.epsilon, s1.is_learning, s1.stand_for = 0, False, Board.STONE_BLACK
    # s2.epsilon, s2.is_learning, s2.stand_for = 0, False, Board.STONE_WHITE
    s2.is_learning, s2.stand_for = False, Board.STONE_WHITE

    s3 = StrategyRand()

    # probs layout: [win as black vs. s2, draw as black, win as white vs. s2,
    #                draw as white, win as black vs. random, win as white vs. random]
    probs = [0, 0, 0, 0, 0, 0]
    games = 3  # 30
    for i in range(games):
        # the learner s1 moves first (plays black)
        s1.stand_for = Board.STONE_BLACK
        s2.stand_for = Board.STONE_WHITE
        g = Game(Board(), s1, s2)
        g.step_to_end()
        if g.winner == Board.STONE_BLACK:
            probs[0] += 1
        elif g.winner == Board.STONE_EMPTY:
            probs[1] += 1

        # the learner s1 moves second (plays white)
        s1.stand_for = Board.STONE_WHITE
        s2.stand_for = Board.STONE_BLACK
        g = Game(Board(), s1, s2)
        g.step_to_end()
        if g.winner == Board.STONE_WHITE:
            probs[2] += 1
        elif g.winner == Board.STONE_EMPTY:
            probs[3] += 1

        # the learner s1 moves first vs. the random opponent
        s1.stand_for = Board.STONE_BLACK
        s3.stand_for = Board.STONE_WHITE
        g = Game(Board(), s1, s3)
        g.step_to_end()
        if g.winner == Board.STONE_BLACK:
            probs[4] += 1

        # the learner s1 moves second vs. the random opponent
        s1.stand_for = Board.STONE_WHITE
        s3.stand_for = Board.STONE_BLACK
        g = Game(Board(), s1, s3)
        g.step_to_end()
        if g.winner == Board.STONE_WHITE:
            probs[5] += 1

    probs = [p / games for p in probs]
    print(probs)

    s1.epsilon, s1.is_learning, s1.stand_for = old_epsilon1, old_is_learning1, old_stand_for1
    # s2.epsilon, s2.is_learning, s2.stand_for = old_epsilon2, old_is_learning2, old_stand_for2
    s2.is_learning, s2.stand_for = old_is_learning2, old_stand_for2
    return probs
def dispose_msg(msg, msg_queue):
    # print('recv:', msg)
    global board
    global s1
    global first_query
    global who_first

    ans = None
    seq = msg.split(' ')
    if seq[0] == 'START:':
        board_size = int(seq[1])
        Board.set_board_size(board_size)
        board = Board()
        if s1 is None:
            s1 = StrategyDNN()
        first_query = True
        who_first = None
        ans = 'START: OK'
        if msg_queue is not None:
            msg_queue.put(('start', ))
        # s1.absorb('?')
        s1.on_episode_start()
    elif seq[0] == 'MOVE:':
        assert len(seq) >= 4, 'protocol inconsistent'
        old_board = copy.deepcopy(board)
        x, y = int(seq[1]), int(seq[2])
        who = Board.STONE_BLACK if int(seq[3]) == 1 else Board.STONE_WHITE
        if who_first is None:
            who_first = who
            print('who first?', who_first)
        if board.is_legal(x, y):
            board.move(x, y, who)
            s1.swallow(who, old_board, board)
            if msg_queue is not None:
                msg_queue.put(('move', who, x * Board.BOARD_SIZE + y))
    elif seq[0] == 'WIN:':
        assert len(seq) == 3, 'protocol inconsistent'
        x, y = int(seq[1]), int(seq[2])
        who = board.get(x, y)
        print('player %d wins the game' % (who, ))
    elif seq[0] == 'UNDO:':
        ans = 'UNDO: unsupported yet'
    elif seq[0] == 'WHERE:':
        if who_first is None:
            who_first = Board.STONE_BLACK
            print('who first?', who_first)
        if first_query:
            s1.stand_for = board.query_stand_for(who_first)
            print('i stand for:', s1.stand_for)
            first_query = False
        assert s1.stand_for is not None
        x, y = s1.preferred_move(board)
        ans = 'HERE: %d %d' % (x, y)
    elif seq[0] == 'END:':
        # s1.close()
        ans = 'END: OK'
    return ans
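
# A walk-through of the text protocol handled above (hypothetical session;
# the module globals board/s1/first_query/who_first must be initialized):
#   dispose_msg('START: 15', None)    # -> 'START: OK'  (new 15x15 board)
#   dispose_msg('MOVE: 7 7 1', None)  # -> None         (black played at 7,7)
#   dispose_msg('WHERE:', None)       # -> 'HERE: x y'  (engine's reply move)
#   dispose_msg('END:', None)         # -> 'END: OK'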
def inference_who_won(self):
    assert len(self.observation) > 0
    # each observation entry is presumably (who, old_board, new_board),
    # matching the swallow(who, old_board, board) call sites
    last = self.observation[-1]
    who, st1 = last[0], last[2]
    oppo = Board.oppo(who)
    oppo_will_win = Board.find_pattern_will_win(st1, oppo)
    if oppo_will_win:
        return oppo
    return Board.STONE_EMPTY
def preferred_board(self, old, moves, context):
    game = context
    self.searcher.board = old.stones.reshape((-1, Board.BOARD_SIZE)).tolist()
    DEPTH = 1
    score, row, col = self.searcher.search(game.whose_turn, DEPTH)
    # print('score%d, loc(%d, %d)' % (score, row, col))
    x = old.stones.copy()
    x[row * Board.BOARD_SIZE + col] = game.whose_turn
    b = Board()
    b.stones = x
    return b
def from_new_start_point(self, winner, s1, s2):
    '''
    Returns:
    ------------
    s1 : Strategy
        the learner
    s2 : Strategy
        the teacher
    '''
    if s1 == winner:
        s2 = s1.mind_clone()
    if s2 == winner:
        s1 = s2.mind_clone()

    # way 1: s1 follows the winner's stand-for
    s1.stand_for = winner.stand_for
    # way 2: s1 switches to the winner's other stand-for
    # s1.stand_for = Board.oppo(winner.stand_for)
    # way 3: s1 randomly selects a stand-for
    # s1.stand_for = np.random.choice(np.array([Board.STONE_BLACK, Board.STONE_WHITE]))
    s2.stand_for = Board.oppo(s1.stand_for)

    s1.is_learning = True
    s2.is_learning = False
    return s1, s2
def step(self):
    moves, self.whose_turn, _ = Game.possible_moves(self.board)
    strat = self.strat1 if self.whose_turn == self.strat1.stand_for else self.strat2
    # print('who', strat.stand_for)
    strat.update(self.board, None)
    new_board = strat.preferred_board(self.board, moves, self)
    # print('who%d play at %s' % (self.whose_turn,
    #       str(divmod(Board.change(self.board, new_board), Board.BOARD_SIZE))))
    # print(self.board.stones)
    if new_board.exploration:
        strat.setup()
        self.exploration_counter += 1

    self.over, self.winner, self.last_loc = new_board.is_over(self.board)

    if self.observer is not None:
        self.observer.swallow(self.whose_turn, self.board, new_board)

    if self.over:
        strat.update_at_end(self.board, new_board)
        opponent_strat = self.strat1 if self.whose_turn != self.strat1.stand_for else self.strat2
        opponent_strat.update_at_end(None, new_board)
        if self.observer is not None:
            self.observer.absorb(self.whose_turn)

    self.board = new_board
    if self.strat1 == self.strat2:
        # self-play with a single strategy object: flip its color each ply
        self.strat1.stand_for = Board.oppo(self.strat1.stand_for)
def setup_brain(self):
    if self.policy1 is None:
        self.policy1 = Brain(self.transformer.get_input_shape,
                             self.transformer.placeholder_inputs,
                             self.transformer.model,
                             RLPolicy.SL_POLICY_DIR,
                             RLPolicy.SL_SUMMARY_DIR)
    assert self.policy1 is not None

    if self.policy2 is not None:
        self.policy2.close()
        self.policy2 = None

    # random choice from oppo_pool
    policy_dir = RLPolicy.SL_POLICY_DIR
    summary_dir = RLPolicy.SL_SUMMARY_DIR
    if self.oppo_brain:
        rl_brain_id = random.choice(tuple(self.oppo_brain.keys()))
        print('the chosen oppo:', rl_brain_id)
        policy_dir = self.oppo_brain[rl_brain_id]
        # summary_dir = self.oppo_summary.get(rl_brain_id, RLPolicy.RL_SUMMARY_DIR_PREFIX + str(0))
        # summary_dir = os.path.join(RLPolicy.WORK_DIR, summary_dir)
    self.policy2 = Brain(self.transformer.get_input_shape,
                         self.transformer.placeholder_inputs,
                         self.transformer.model,
                         policy_dir, summary_dir)
    assert self.policy2 is not None

    self.policy1_stand_for = random.choice([Board.STONE_BLACK, Board.STONE_WHITE])
    self.policy2_stand_for = Board.oppo(self.policy1_stand_for)
def vs_human(self, which_side_human_play):
    strategy = self.which_one(Board.oppo(which_side_human_play))
    if strategy is None or isinstance(strategy, StrategyRand):
        strategy = self.which_one(which_side_human_play)
    if strategy is None:
        print('without opponent')
        return

    old_is_learning, old_stand_for = strategy.is_learning, strategy.stand_for
    strategy.is_learning, strategy.stand_for = False, Board.oppo(which_side_human_play)

    s1 = strategy
    s2 = StrategyHuman()
    s2.stand_for = which_side_human_play

    self.game = Game(Board(), s1, s2, self.msg_queue)
    self.game.step_to_end()

    strategy.is_learning, strategy.stand_for = old_is_learning, old_stand_for
def possible_moves(board):
    '''
    Returns:
    --------------
    boards : Board list
        one candidate board per empty point
    who : int
        the player whose turn it is
    locs : ndarray
        flat indices of the empty points
    '''
    # whose turn is it?
    who = board.whose_turn_now()
    # print("it is [%d]'s turn" % who)
    boards = []
    loc = np.where(board.stones == 0)
    # print(loc)
    for i in loc[0]:
        x = board.stones.copy()
        x[i] = who
        b = Board()
        b.stones = x
        boards.append(b)
    # print('possible moves[%d]' % len(boards))
    return boards, who, loc[0]
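
# Usage sketch (assumes this repo's Board and Game; BOARD_SIZE_SQ is the
# total number of points, as used elsewhere in the codebase): on an empty
# board every point is a legal move.
def _demo_possible_moves():
    boards, who, locs = Game.possible_moves(Board())
    assert len(boards) == len(locs) == Board.BOARD_SIZE_SQ
    print('player to move:', who)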
class Game(object):
    def __init__(self):
        self.cur_board = Board()
        self.cur_player = self.cur_board.whose_turn_now()
        self.is_over = False
        self.winner = None
        self.history_states = []
        self.history_actions = []
        self.reward = 0.
        self.num_of_moves = 0
        self.rl_stand_for = Board.STONE_EMPTY
        self.first_rl_step = None

    def move(self, loc):
        old_board = copy.deepcopy(self.cur_board)
        self.cur_board.move(loc[0], loc[1], self.cur_player)
        self.cur_player = Board.oppo(self.cur_player)
        self.is_over, self.winner, _ = self.cur_board.is_over(old_board)
        self.num_of_moves += 1

    def record_history(self, state, action):
        self.history_states.append(state)
        self.history_actions.append((self.cur_player, action))

    def remember_1st_rl_step(self, state):
        assert state is not None
        if self.first_rl_step is None:
            self.first_rl_step = (state, self.cur_player)

    def calc_reward(self, stand_for):
        assert self.is_over
        if self.winner == 0:  # draw
            self.reward = 0
        elif self.winner == stand_for:
            self.reward = 1
        else:
            self.reward = -1
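
# Usage sketch for this (no-argument) Game class; note the trainer code
# above uses a different Game(board, s1, s2) constructor from another
# module. The coordinates here are hypothetical.
def _demo_game():
    game = Game()
    game.move((7, 7))  # the side to move plays at row 7, col 7
    if game.is_over:
        game.calc_reward(Board.STONE_BLACK)
        print("reward from black's perspective:", game.reward)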
class FiveGame(TwoPlayerGame):
    def __init__(self):
        self.reset()

    def reset(self):
        TwoPlayerGame.reset(self)
        self.movesDone = 0
        self.b = Board()

    def isLegal(self, c, pos):
        return self.b.is_legal(pos[0], pos[1])

    def _fiveRow(self, c, pos):
        b = self.b.stones.reshape(-1, Board.BOARD_SIZE)
        # return whether playing c at pos completes five in a row;
        # doMove depends on this result
        return self.b.find_conn_5(b, pos[0], pos[1], c)

    def getLegals(self, c):
        loc = np.where(self.b.stones == 0)
        return [divmod(i, Board.BOARD_SIZE) for i in loc[0]]

    def doMove(self, c, pos):
        """ the action is a (color, position) tuple for the next stone to move.
            returns True if the move was legal. """
        self.movesDone += 1
        if not self.isLegal(c, pos):
            return False
        elif self._fiveRow(c, pos):
            self.winner = c
            self.b.move(pos[0], pos[1], c)
            return True
        else:
            self.b.move(pos[0], pos[1], c)
            if self.movesDone == Board.BOARD_SIZE_SQ:
                self.winner = Board.STONE_EMPTY  # board full: draw
            return True

    def playToTheEnd(self, p1, p2):
        """ alternate playing moves between players until the game is over. """
        assert p1.color == -p2.color
        i = 0
        p1.game = self
        p2.game = self
        players = [p1, p2]
        while not self.gameOver():
            p = players[i]
            self.performAction(p.getAction())
            i = (i + 1) % 2
def learn_from_2_teachers(self):
    s1 = StrategyMinMax()
    s1.stand_for = Board.STONE_BLACK
    self.strategy_1 = s1

    s2 = StrategyMinMax()
    s2.stand_for = Board.STONE_WHITE
    self.strategy_2 = s2

    observer = StrategyMC()

    win1, win2, draw = 0, 0, 0
    step_counter, explo_counter = 0, 0
    begin = datetime.datetime.now()

    episodes = 10000
    for i in range(episodes):
        g = Game(Board(), s1, s2, observer=observer)
        g.step_to_end()
        win1 += 1 if g.winner == Board.STONE_BLACK else 0
        win2 += 1 if g.winner == Board.STONE_WHITE else 0
        draw += 1 if g.winner == Board.STONE_EMPTY else 0
        step_counter += g.step_counter
        explo_counter += g.exploration_counter
        print('training...%d' % i)

    total = win1 + win2 + draw
    print("black win: %f" % (win1 / total))
    print("white win: %f" % (win2 / total))
    print("draw: %f" % (draw / total))
    print('avg. steps[%f], avg. explos[%f]' % (step_counter / episodes, explo_counter / episodes))

    end = datetime.datetime.now()
    diff = end - begin
    print("time cost[%f]s, avg.[%f]s" % (diff.total_seconds(), diff.total_seconds() / episodes))

    observer.save('./brain1.npz')
def train1(self, s1, s2):
    '''train one time

    Returns:
    ------------
    winner : Strategy
        the winning strategy
    win_rate : float
        the winner's share of all episodes
    '''
    max_explore_rate = 0.95
    win1, win2, draw = 0, 0, 0
    step_counter, explo_counter = 0, 0
    begin = datetime.datetime.now()

    episodes = 1
    samples = 100
    interval = episodes // samples
    perf = [[] for _ in range(7)]
    learner = s1 if s1.is_learning else s2
    oppo = self.which_one(Board.oppo(learner.stand_for))
    stat_win = []
    # past_me = learner.mind_clone()
    for i in range(episodes):
        # if (i + 1) % interval == 0:
        #     # print(np.allclose(s1.hidden_weights, past_me.hidden_weights))
        #     probs = self.measure_perf(learner, oppo)
        #     perf[0].append(i)
        #     for idx, x in enumerate(probs):
        #         perf[idx + 1].append(x)

        # anneal exploration: epsilon decays exponentially over the run
        learner.epsilon = max_explore_rate * np.exp(-5 * i / episodes)  # * (1 if i < episodes//2 else 0.3)

        g = Game(Board(), s1, s2)
        g.step_to_end()

        win1 += 1 if g.winner == Board.STONE_BLACK else 0
        win2 += 1 if g.winner == Board.STONE_WHITE else 0
        draw += 1 if g.winner == Board.STONE_EMPTY else 0
        stat_win.append(win1 - win2 - draw)
        # rec.append(win1)

        step_counter += g.step_counter
        explo_counter += g.exploration_counter
        # print('steps[%d], explos[%d]' % (g.step_counter, g.exploration_counter))
        print('training...%d' % i)

    total = win1 + win2 + draw
    print("black win: %f" % (win1 / total))
    print("white win: %f" % (win2 / total))
    print("draw: %f" % (draw / total))
    print('avg. steps[%f], avg. explos[%f]' % (step_counter / episodes, explo_counter / episodes))

    end = datetime.datetime.now()
    diff = end - begin
    print("time cost[%f]s, avg.[%f]s" % (diff.total_seconds(), diff.total_seconds() / episodes))

    with open('stat-result-win.txt', 'w') as f:
        f.write(repr(stat_win))

    # print(perf)
    # self.draw_perf(perf)
    # np.set_printoptions(threshold=np.nan, formatter={'float_kind': lambda x: "%.4f" % x})
    # with open('stat-result-net-train-errors.txt', 'w') as f:
    #     f.write(repr(np.array(s1.errors)))

    winner = Board.STONE_BLACK if win1 >= win2 else Board.STONE_WHITE
    return self.which_one(winner), max(win1, win2) / total
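
# Standalone check of the exploration schedule above (np is this module's
# numpy alias; the episode count is hypothetical): epsilon falls from
# max_explore_rate to about exp(-5), i.e. ~0.7% of it, by the last episode.
def _demo_epsilon_schedule(max_explore_rate=0.95, episodes=10000):
    for i in (0, episodes // 2, episodes - 1):
        print(i, max_explore_rate * np.exp(-5 * i / episodes))
    # prints roughly 0.95, 0.078, 0.0064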
# (tail of a move-selection method; its header falls outside this excerpt)
        self.mcts.update_with_move(best_move)
        return m
    raise Exception('impossible')

def _value_fn(self, board):
    state, _ = self.get_input_values(board.stones)
    v = self.brain.get_state_value(state)
    return v

def _policy_fn(self, board):
    _, _, legal_moves = Game.possible_moves(board)
    state, _ = self.get_input_values(board.stones)
    probs = self.brain.get_move_probs(state)
    probs = probs[0, legal_moves]
    return list(zip(legal_moves, probs))

def _rollout_fn(self, board, legal_moves):
    state, _ = self.get_input_values(board.stones)
    probs = self.brain.get_move_probs(state)
    return probs

def get_input_values(self, board):
    # here 'board' is the raw stones array, not a Board object
    state, _ = self.brain.adapt_state(board)
    legal = (board == Board.STONE_EMPTY)
    return state, legal

if __name__ == '__main__':
    mcts = StrategyMCTS1()
    board = Board()
    mcts.preferred_board(board, None, None)
def test_sim_many():
    zero = AG0(input_fn, model_fn)
    zero.prepare()
    s0 = Board.rand_generate_a_position()
    zero._mcts.sim_many(s0, N_SIMS)
def reinforce(self, resume=True):
    self.oppo_pool = self.get_mindsets(RL_BRAIN_DIR, FILE_PREFIX)

    # revive the learner: resume from the latest RL checkpoint if possible,
    # otherwise bootstrap from the supervised (SL) checkpoint
    if resume and len(self.oppo_pool) != 0:
        file = tf.train.latest_checkpoint(RL_BRAIN_DIR)
        part_vars = False
    else:
        file = tf.train.latest_checkpoint(SL_BRAIN_DIR)
        part_vars = True
    s1 = StrategyDNN(is_train=False, is_revive=True, is_rl=True, from_file=file, part_vars=part_vars)
    print('I was born from', file)

    # pick an opponent from the pool, falling back to the SL checkpoint
    if len(self.oppo_pool) != 0:
        file = random.choice(self.oppo_pool)
        file = os.path.join(RL_BRAIN_DIR, file)
        part_vars = False
    else:
        file = tf.train.latest_checkpoint(SL_BRAIN_DIR)
        part_vars = True
    s2 = StrategyDNN(is_train=False, is_revive=True, is_rl=False, from_file=file, part_vars=part_vars)
    print('vs.', file)

    stat = []
    # n_lose = 0
    iter_n = 100
    for i in range(iter_n):
        print('iter:', i)
        win1, win2, draw = 0, 0, 0
        step_counter, explo_counter = 0, 0
        episodes = cfg.REINFORCE_PERIOD
        for _ in range(episodes):
            s1.stand_for = random.choice([Board.STONE_BLACK, Board.STONE_WHITE])
            s2.stand_for = Board.oppo(s1.stand_for)
            g = Game(Board.rand_generate_a_position(), s1, s2, observer=s1)
            g.step_to_end()
            win1 += 1 if g.winner == s1.stand_for else 0
            win2 += 1 if g.winner == s2.stand_for else 0
            draw += 1 if g.winner == Board.STONE_EMPTY else 0
            # print('winner: {:d}, stand for: {:d}'.format(g.winner, s1.stand_for))
            s1.win_ratio = win1 / win2 if win2 != 0 else 1.
            step_counter += g.step_counter
            explo_counter += g.exploration_counter

        # gate: snapshot the learner into the opponent pool once it clearly
        # outperforms the current opponent, then draw a fresh opponent
        if s1.win_ratio > 1.1:
            file = FILE_PREFIX + '-' + str(i)
            s1.mind_clone(os.path.join(RL_BRAIN_DIR, FILE_PREFIX), i)
            self.oppo_pool.append(file)

            file = random.choice(self.oppo_pool)
            file = os.path.join(RL_BRAIN_DIR, file)
            s2.close()
            s2 = StrategyDNN(is_train=False, is_revive=True, is_rl=False, from_file=file, part_vars=False)
            print('vs.', file)
            # n_lose = 0
        # elif win1 < win2:
        #     n_lose += 1
        # if n_lose >= 50:
        #     break

        if i % 1 == 0 or i + 1 == iter_n:
            total = win1 + win2 + draw
            win1_r = win1 / total
            win2_r = win2 / total
            draw_r = draw / total
            print("iter:%d, win: %.3f, lose: %.3f, draw: %.3f, t: %.3f" %
                  (i, win1_r, win2_r, draw_r, s1.temperature))
            stat.append([win1_r, win2_r, draw_r])
            print('avg. steps[%f], avg. explos[%f]' % (step_counter / episodes, explo_counter / episodes))

        if i % 10 == 0 or i + 1 == iter_n:
            np.savez(STAT_FILE, stat=np.array(stat))

    print('rl done. you can try it.')
    self.strategy_1 = self.strategy_2 = s1
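
# The gating rule above in isolation (the 1.1 threshold comes from the code;
# the sample counts are hypothetical): the learner is snapshotted into the
# opponent pool only when its win/loss ratio exceeds the threshold.
def _passes_gate(win1, win2, threshold=1.1):
    ratio = win1 / win2 if win2 != 0 else 1.
    return ratio > threshold

assert _passes_gate(56, 44) and not _passes_gate(50, 50)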