import copy

# Board and StrategyDNN are project-local (imported from the surrounding
# package); copy is the only stdlib dependency of this handler.


def dispose_msg(msg, msg_queue):
    """Handle one space-separated text command and return the reply, if any."""
    # print('recv:', msg)
    global board
    global s1
    global first_query
    global who_first

    ans = None
    seq = msg.split(' ')
    if seq[0] == 'START:':
        # 'START: <board_size>' -- (re)initialize the board and the strategy
        board_size = int(seq[1])
        Board.set_board_size(board_size)
        board = Board()
        if s1 is None:
            s1 = StrategyDNN()
        first_query = True
        who_first = None
        ans = 'START: OK'
        if msg_queue is not None:
            msg_queue.put(('start',))
        s1.absorb('?')
        s1.on_episode_start()
    elif seq[0] == 'MOVE:':
        # 'MOVE: <x> <y> <who>' -- a stone was placed (who == 1 means black)
        assert len(seq) >= 4, 'protocol inconsistent'
        old_board = copy.deepcopy(board)
        x, y = int(seq[1]), int(seq[2])
        who = Board.STONE_BLACK if int(seq[3]) == 1 else Board.STONE_WHITE
        if who_first is None:
            who_first = who
            print('who first?', who_first)
        if board.is_legal(x, y):
            board.move(x, y, who)
            s1.swallow(who, old_board, board)
            if msg_queue is not None:
                msg_queue.put(('move', who, x * Board.BOARD_SIZE + y))
    elif seq[0] == 'WIN:':
        # 'WIN: <x> <y>' -- the stone at (x, y) completed a winning line
        assert len(seq) == 3, 'protocol inconsistent'
        x, y = int(seq[1]), int(seq[2])
        who = board.get(x, y)
        print('player %d wins the game' % (who,))
    elif seq[0] == 'UNDO:':
        ans = 'UNDO: unsupported yet'
    elif seq[0] == 'WHERE:':
        # 'WHERE:' -- we are asked for our next move
        if who_first is None:
            who_first = Board.STONE_BLACK
            print('who first?', who_first)
        if first_query:
            s1.stand_for = board.query_stand_for(who_first)
            print('i stand for:', s1.stand_for)
            first_query = False
        assert s1.stand_for is not None
        x, y = s1.preferred_move(board)
        ans = 'HERE: %d %d' % (x, y)
    elif seq[0] == 'END:':
        # s1.close()
        ans = 'END: OK'
    return ans
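
# A sketch of the wire protocol dispose_msg handles, reconstructed from the
# branches above; the peer (GUI or match engine) and the exact exchange order
# are assumptions, only the message formats themselves come from the code:
#
#   -> 'START: 15'        <- 'START: OK'   # board size 15, fresh episode
#   -> 'MOVE: 7 7 1'                       # black (1) played at (7, 7)
#   -> 'WHERE:'           <- 'HERE: 8 8'   # we are asked for our move
#   -> 'WIN: 8 8'                          # stone at (8, 8) made five
#   -> 'UNDO:'            <- 'UNDO: unsupported yet'
#   -> 'END:'             <- 'END: OK'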

def reinforce(self):
    # Seed the opponent pool with a supervised(-ly trained) net if empty.
    if len(self.oppo_pool) == 0:
        self.oppo_pool.append(
            StrategyDNN(is_train=False, is_revive=True, is_rl=False))

    s1 = StrategyDNN(is_train=False, is_revive=True, is_rl=True)
    s2 = random.choice(self.oppo_pool)

    stat = []
    win1, win2, draw = 0, 0, 0
    n_lose = 0
    iter_n = 100
    i = 0
    while True:
        print('iter:', i)
        for _ in range(1000):
            # Randomize which color s1 plays so it learns both sides.
            s1.stand_for = random.choice(
                [Board.STONE_BLACK, Board.STONE_WHITE])
            s2.stand_for = Board.oppo(s1.stand_for)
            g = Game(Board.rand_generate_a_position(), s1, s2, observer=s1)
            g.step_to_end()
            win1 += 1 if g.winner == s1.stand_for else 0
            win2 += 1 if g.winner == s2.stand_for else 0
            draw += 1 if g.winner == Board.STONE_EMPTY else 0

        # if win1 > win2:
        #     s1_c = s1.mind_clone()
        #     self.oppo_pool.append(s1_c)
        #     s2 = random.choice(self.oppo_pool)
        #     n_lose = 0
        #     print('stronger, oppos:', len(self.oppo_pool))
        # elif win1 < win2:
        #     n_lose += 1
        #
        # if n_lose >= 50:
        #     break

        # The counters are never reset, so these are running rates over all
        # episodes so far; `i % 1 == 0` means "report every iteration".
        if i % 1 == 0 or i + 1 == iter_n:
            total = win1 + win2 + draw
            win1_r = win1 / total
            win2_r = win2 / total
            draw_r = draw / total
            print("iter:%d, win: %.3f, loss: %.3f, tie: %.3f"
                  % (i, win1_r, win2_r, draw_r))
            stat.append([win1_r, win2_r, draw_r])

        i += 1
        if i > iter_n:
            break

    stat = np.array(stat)
    print('stat. shape:', stat.shape)
    np.savez('/home/splendor/fusor/stat.npz', stat=stat)

    self.strategy_1 = self.strategy_2 = s1
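
# Sketch: inspecting the statistics saved above (the path is hard-coded in
# reinforce; each row is the running [win, loss, tie] rate at one report):
#
#   data = np.load('/home/splendor/fusor/stat.npz')
#   stat = data['stat']
#   print(stat.shape)  # (n_reports, 3)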

def init_both_sides(self):
    # feat = Board.BOARD_SIZE_SQ * 2 + 2
    # if self.strategy_1 is None:
    #     s1 = StrategyTD(feat, feat * 2)
    #     s1.stand_for = Board.STONE_BLACK
    #     s1.alpha = 0.3
    #     s1.beta = 0.3
    #     s1.lambdaa = 0.05
    #     s1.epsilon = 0.3
    #     self.strategy_1 = s1
    # else:
    #     s1 = self.strategy_1
    #     s1.epsilon = 0.3
    if self.strategy_1 is None:
        # s1 = StrategyMC()
        # s1 = StrategyANN(feat, feat * 2)
        file = tf.train.latest_checkpoint(RL_BRAIN_DIR)
        s1 = StrategyDNN(from_file=file, part_vars=True)
        # s1 = StrategyMCTS1()
        self.strategy_1 = s1
    else:
        s1 = self.strategy_1
    s1.is_learning = True
    s1.stand_for = Board.STONE_BLACK

    # if self.strategy_2 is None:
    #     s2 = StrategyTD(feat, feat * 2)
    #     s2.stand_for = Board.STONE_WHITE
    #     self.strategy_2 = s2
    # else:
    #     s2 = self.strategy_2
    # s2.is_learning = False
    s2 = StrategyRand()
    # s2 = StrategyMinMax()
    s2.stand_for = Board.STONE_WHITE
    self.strategy_2 = s2
    return s1, s2

def init_both_sides(self):
    # feature vector length; only the commented-out strategies below use it
    feat = Board.BOARD_SIZE_SQ * 2 + 2
    # if self.strategy_1 is None:
    #     s1 = StrategyTD(feat, feat * 2)
    #     s1.stand_for = Board.STONE_BLACK
    #     # s1.alpha = 0.3
    #     # s1.beta = 0.3
    #     s1.lambdaa = 0.05
    #     s1.epsilon = 0.3
    #     self.strategy_1 = s1
    # else:
    #     s1 = self.strategy_1
    #     s1.epsilon = 0.3
    if self.strategy_1 is None:
        # s1 = StrategyMC()
        # s1 = StrategyANN(feat, feat * 2)
        s1 = StrategyDNN()
        self.strategy_1 = s1
    else:
        s1 = self.strategy_1
    s1.is_learning = True
    s1.stand_for = Board.STONE_BLACK

    # if self.strategy_2 is None:
    #     s2 = StrategyTD(feat, feat * 2)
    #     s2.stand_for = Board.STONE_WHITE
    #     self.strategy_2 = s2
    # else:
    #     s2 = self.strategy_2
    # s2.is_learning = False
    s2 = StrategyRand()
    # s2 = StrategyMinMax()
    s2.stand_for = Board.STONE_WHITE
    self.strategy_2 = s2
    return s1, s2
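
# Sketch: how a caller might wire the two sides into a game, assuming the
# same Game/Board API used by reinforce() below (the Game signature and
# observer keyword are taken from there):
#
#   s1, s2 = self.init_both_sides()
#   g = Game(Board(), s1, s2, observer=s1)
#   g.step_to_end()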

def reinforce(self, resume=True):
    self.oppo_pool = self.get_mindsets(RL_BRAIN_DIR, FILE_PREFIX)

    # Revive s1 from the latest RL checkpoint when resuming; otherwise
    # bootstrap it from the supervised (SL) checkpoint.
    part_vars = True
    if resume and len(self.oppo_pool) != 0:
        file = tf.train.latest_checkpoint(RL_BRAIN_DIR)
        part_vars = False
    else:
        file = tf.train.latest_checkpoint(SL_BRAIN_DIR)
        part_vars = True
    s1 = StrategyDNN(is_train=False, is_revive=True, is_rl=True,
                     from_file=file, part_vars=part_vars)
    print('I was born from', file)

    # The opponent s2 is drawn from the pool of past selves, or from the SL
    # checkpoint if the pool is still empty.
    if len(self.oppo_pool) != 0:
        file = random.choice(self.oppo_pool)
        file = os.path.join(RL_BRAIN_DIR, file)
        part_vars = False
    else:
        file = tf.train.latest_checkpoint(SL_BRAIN_DIR)
        part_vars = True
    s2 = StrategyDNN(is_train=False, is_revive=True, is_rl=False,
                     from_file=file, part_vars=part_vars)
    print('vs.', file)

    stat = []
    # n_lose = 0
    iter_n = 100
    for i in range(iter_n):
        print('iter:', i)
        win1, win2, draw = 0, 0, 0
        step_counter, explo_counter = 0, 0
        episodes = cfg.REINFORCE_PERIOD
        for _ in range(episodes):
            s1.stand_for = random.choice(
                [Board.STONE_BLACK, Board.STONE_WHITE])
            s2.stand_for = Board.oppo(s1.stand_for)
            g = Game(Board.rand_generate_a_position(), s1, s2, observer=s1)
            g.step_to_end()
            win1 += 1 if g.winner == s1.stand_for else 0
            win2 += 1 if g.winner == s2.stand_for else 0
            draw += 1 if g.winner == Board.STONE_EMPTY else 0
            # print('winner: {:d}, stand for: {:d}'.format(g.winner, s1.stand_for))
            s1.win_ratio = win1 / win2 if win2 != 0 else 1.
            step_counter += g.step_counter
            explo_counter += g.exploration_counter

        # Promote s1 into the opponent pool once it clearly outperforms
        # (>10% more wins than losses), then resample a fresh opponent.
        if s1.win_ratio > 1.1:
            file = FILE_PREFIX + '-' + str(i)
            s1.mind_clone(os.path.join(RL_BRAIN_DIR, FILE_PREFIX), i)
            self.oppo_pool.append(file)
            file = random.choice(self.oppo_pool)
            file = os.path.join(RL_BRAIN_DIR, file)
            s2.close()
            s2 = StrategyDNN(is_train=False, is_revive=True, is_rl=False,
                             from_file=file, part_vars=False)
            print('vs.', file)
            # n_lose = 0
        # elif win1 < win2:
        #     n_lose += 1
        # if n_lose >= 50:
        #     break

        if i % 1 == 0 or i + 1 == iter_n:
            total = win1 + win2 + draw
            win1_r = win1 / total
            win2_r = win2 / total
            draw_r = draw / total
            print("iter:%d, win: %.3f, lose: %.3f, draw: %.3f, t: %.3f"
                  % (i, win1_r, win2_r, draw_r, s1.temperature))
            stat.append([win1_r, win2_r, draw_r])
            print('avg. steps[%f], avg. explos[%f]'
                  % (step_counter / episodes, explo_counter / episodes))

        if i % 10 == 0 or i + 1 == iter_n:
            np.savez(STAT_FILE, stat=np.array(stat))

    print('rl done. you can try it.')
    self.strategy_1 = self.strategy_2 = s1
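
# Sketch of a driver, assuming a hypothetical owner class (here `Trainer`)
# that provides get_mindsets(), oppo_pool, and the strategy_* attributes:
#
#   trainer = Trainer()
#   trainer.reinforce(resume=True)   # continue from the latest RL checkpoint
#   trainer.reinforce(resume=False)  # bootstrap s1 from the SL checkpoint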