def match(self):
    """Play a fixed number of paired games between strategy_1 and strategy_2.

    Each round is played twice from a fresh random position — once with each
    color assignment — and the six outcome counters (win/loss/draw per color
    assignment, presumably from strategy_1's point of view) are printed at the end.
    """
    first, second = self.strategy_1, self.strategy_2
    print('player1:', first.__class__.__name__)
    print('player2:', second.__class__.__name__)

    tallies = np.zeros(6)
    games = 100  # 30
    for round_no in range(games):
        print(round_no)

        # Leg 1: player1 takes black, player2 takes white.
        first.stand_for = Board.STONE_BLACK
        second.stand_for = Board.STONE_WHITE
        game = Game(Board.rand_generate_a_position(), first, second)
        game.step_to_end()
        if game.winner == Board.STONE_BLACK:
            tallies[0] += 1
        elif game.winner == Board.STONE_WHITE:
            tallies[1] += 1
        else:
            tallies[2] += 1

        # Leg 2: colors swapped.
        first.stand_for = Board.STONE_WHITE
        second.stand_for = Board.STONE_BLACK
        game = Game(Board.rand_generate_a_position(), first, second)
        game.step_to_end()
        if game.winner == Board.STONE_WHITE:
            tallies[3] += 1
        elif game.winner == Board.STONE_BLACK:
            tallies[4] += 1
        else:
            tallies[5] += 1

    print('total play:', games)
    print(tallies)
def measure_perf(self, s1, s2):
    """Measure learner s1's strength against s2 and against a random player.

    s1 is temporarily switched to greedy evaluation mode (epsilon 0, learning
    off) and s2's learning is disabled; both are restored before returning.

    Returns a list of six rates over `games` rounds:
    [win-as-black vs s2, draw-as-black vs s2,
     win-as-white vs s2, draw-as-white vs s2,
     win-as-black vs random, win-as-white vs random].
    """
    # Snapshot every knob we are about to override so it can be restored.
    saved_s1 = (s1.epsilon, s1.is_learning, s1.stand_for)
    saved_s2 = (s2.is_learning, s2.stand_for)

    s1.epsilon, s1.is_learning, s1.stand_for = 0, False, Board.STONE_BLACK
    s2.is_learning, s2.stand_for = False, Board.STONE_WHITE

    rand_oppo = StrategyRand()
    counts = [0, 0, 0, 0, 0, 0]
    games = 3  # 30
    for _ in range(games):
        # Learner moves first (black) against s2.
        s1.stand_for, s2.stand_for = Board.STONE_BLACK, Board.STONE_WHITE
        bout = Game(Board(), s1, s2)
        bout.step_to_end()
        if bout.winner == Board.STONE_BLACK:
            counts[0] += 1
        elif bout.winner == Board.STONE_EMPTY:
            counts[1] += 1

        # Learner moves second (white) against s2.
        s1.stand_for, s2.stand_for = Board.STONE_WHITE, Board.STONE_BLACK
        bout = Game(Board(), s1, s2)
        bout.step_to_end()
        if bout.winner == Board.STONE_WHITE:
            counts[2] += 1
        elif bout.winner == Board.STONE_EMPTY:
            counts[3] += 1

        # Learner moves first against the random opponent (wins only).
        s1.stand_for, rand_oppo.stand_for = Board.STONE_BLACK, Board.STONE_WHITE
        bout = Game(Board(), s1, rand_oppo)
        bout.step_to_end()
        if bout.winner == Board.STONE_BLACK:
            counts[4] += 1

        # Learner moves second against the random opponent (wins only).
        s1.stand_for, rand_oppo.stand_for = Board.STONE_WHITE, Board.STONE_BLACK
        bout = Game(Board(), s1, rand_oppo)
        bout.step_to_end()
        if bout.winner == Board.STONE_WHITE:
            counts[5] += 1

    probs = [c / games for c in counts]
    print(probs)

    # Restore the callers' configuration.
    s1.epsilon, s1.is_learning, s1.stand_for = saved_s1
    s2.is_learning, s2.stand_for = saved_s2
    return probs
def reinforce(self):
    """Reinforcement self-play: train an RL network against a pooled opponent.

    Plays `iter_n` blocks of 1000 games with the learner's color randomized
    each game, reports the cumulative win/loss/tie rates after every block,
    saves the rate history to disk, and installs the trained strategy as
    both players.

    Fix: the original `while True` loop incremented `i` and then tested
    `i > iter_n`, so it ran iter_n + 1 iterations; `for i in range(iter_n)`
    runs exactly iter_n, matching the checkpoint-based reinforce() variant.
    The dead `n_lose` counter (only referenced from commented-out code) is
    dropped, and the always-true `i % 1 == 0` report guard is simplified away.
    """
    if len(self.oppo_pool) == 0:
        # Seed the opponent pool with a supervised (non-RL) network.
        self.oppo_pool.append(
            StrategyDNN(is_train=False, is_revive=True, is_rl=False))
    s1 = StrategyDNN(is_train=False, is_revive=True, is_rl=True)
    s2 = random.choice(self.oppo_pool)

    stat = []
    win1, win2, draw = 0, 0, 0  # cumulative over all iterations
    iter_n = 100
    for i in range(iter_n):
        print('iter:', i)
        for _ in range(1000):
            # Randomize which color the learner takes each game; s1 also
            # observes the game so it can learn from it.
            s1.stand_for = random.choice(
                [Board.STONE_BLACK, Board.STONE_WHITE])
            s2.stand_for = Board.oppo(s1.stand_for)
            g = Game(Board.rand_generate_a_position(), s1, s2, observer=s1)
            g.step_to_end()
            win1 += 1 if g.winner == s1.stand_for else 0
            win2 += 1 if g.winner == s2.stand_for else 0
            draw += 1 if g.winner == Board.STONE_EMPTY else 0

        # Report cumulative rates after each 1000-game block.
        total = win1 + win2 + draw
        win1_r = win1 / total
        win2_r = win2 / total
        draw_r = draw / total
        print("iter:%d, win: %.3f, loss: %.3f, tie: %.3f" % (i, win1_r, win2_r, draw_r))
        stat.append([win1_r, win2_r, draw_r])

    stat = np.array(stat)
    print('stat. shape:', stat.shape)
    np.savez('/home/splendor/fusor/stat.npz', stat=stat)
    self.strategy_1 = self.strategy_2 = s1
def vs_human(self, which_side_human_play):
    """Run one interactive game: a human plays `which_side_human_play`, an AI the other color."""
    ai_side = Board.oppo(which_side_human_play)
    ai = self.which_one(ai_side)
    if ai is None or isinstance(ai, StrategyRand):
        # Fall back to whatever strategy is registered on the human's side.
        ai = self.which_one(which_side_human_play)
    if ai is None:
        print('without opponent')
        return

    # Put the AI in evaluation mode on the correct color for this game,
    # remembering its previous configuration.
    saved = (ai.is_learning, ai.stand_for)
    ai.is_learning, ai.stand_for = False, ai_side

    human = StrategyHuman()
    human.stand_for = which_side_human_play

    self.game = Game(Board(), ai, human, self.msg_queue)
    self.game.step_to_end()

    # Restore the AI's original mode.
    ai.is_learning, ai.stand_for = saved
def learn_from_2_teachers(self):
    """Train a Monte-Carlo observer by watching two MinMax teachers play each other."""
    black_teacher = StrategyMinMax()
    black_teacher.stand_for = Board.STONE_BLACK
    self.strategy_1 = black_teacher

    white_teacher = StrategyMinMax()
    white_teacher.stand_for = Board.STONE_WHITE
    self.strategy_2 = white_teacher

    observer = StrategyMC()

    win1 = win2 = draw = 0
    step_counter = explo_counter = 0
    begin = datetime.datetime.now()

    episodes = 10000
    for episode in range(episodes):
        # The observer watches every game between the two teachers.
        g = Game(Board(), black_teacher, white_teacher, observer=observer)
        g.step_to_end()

        if g.winner == Board.STONE_BLACK:
            win1 += 1
        elif g.winner == Board.STONE_WHITE:
            win2 += 1
        elif g.winner == Board.STONE_EMPTY:
            draw += 1

        step_counter += g.step_counter
        explo_counter += g.exploration_counter
        print('training...%d' % episode)

    total = win1 + win2 + draw
    print("black win: %f" % (win1 / total))
    print("white win: %f" % (win2 / total))
    print("draw: %f" % (draw / total))
    print('avg. steps[%f], avg. explos[%f]' % (step_counter / episodes, explo_counter / episodes))

    end = datetime.datetime.now()
    diff = end - begin
    print("time cost[%f]s, avg.[%f]s" % (diff.total_seconds(), diff.total_seconds() / episodes))

    observer.save('./brain1.npz')
def train1(self, s1, s2):
    '''Train for one session of self-play episodes between s1 and s2.

    The learner (whichever of s1/s2 has `is_learning` set) plays with an
    exploration rate that decays exponentially over the episodes. Outcome
    statistics are printed and the running win margin is dumped to
    'stat-result-win.txt'.

    Returns:
    ------------
    winner : Strategy
        the win strategy
    ratio : float
        the winner's share of the games played
    '''
    max_explore_rate = 0.95
    win1, win2, draw = 0, 0, 0
    step_counter, explo_counter = 0, 0
    begin = datetime.datetime.now()

    episodes = 1
    samples = 100
    # NOTE(review): interval is 0 when episodes < samples; it is only used
    # inside the commented-out perf-sampling block below, which would raise
    # ZeroDivisionError if re-enabled with these constants — confirm before
    # reviving it.
    interval = episodes // samples
    perf = [[] for _ in range(7)]
    # The learner is whichever strategy has learning enabled; `oppo` is the
    # strategy standing for the opposite color (used only by the commented-out
    # measure_perf sampling).
    learner = s1 if s1.is_learning else s2
    oppo = self.which_one(Board.oppo(learner.stand_for))
    stat_win = []  # running win margin (win1 - win2 - draw) per episode
    # past_me = learner.mind_clone()
    for i in range(episodes):
        # if (i + 1) % interval == 0:
        #     # print(np.allclose(s1.hidden_weights, past_me.hidden_weights))
        #     probs = self.measure_perf(learner, oppo)
        #     perf[0].append(i)
        #     for idx, x in enumerate(probs):
        #         perf[idx + 1].append(x)

        # Exploration rate decays exponentially from max_explore_rate
        # toward ~0 as i approaches episodes.
        learner.epsilon = max_explore_rate * np.exp(-5 * i / episodes)  # * (1 if i < episodes//2 else 0.3) #

        g = Game(Board(), s1, s2)
        g.step_to_end()

        win1 += 1 if g.winner == Board.STONE_BLACK else 0
        win2 += 1 if g.winner == Board.STONE_WHITE else 0
        draw += 1 if g.winner == Board.STONE_EMPTY else 0
        stat_win.append(win1 - win2 - draw)
        # rec.append(win1)

        step_counter += g.step_counter
        explo_counter += g.exploration_counter
        # print('steps[%d], explos[%d]' % (g.step_counter, g.exploration_counter))
        print('training...%d' % i)

    total = win1 + win2 + draw
    print("black win: %f" % (win1 / total))
    print("white win: %f" % (win2 / total))
    print("draw: %f" % (draw / total))
    print('avg. steps[%f], avg. explos[%f]' % (step_counter / episodes, explo_counter / episodes))

    end = datetime.datetime.now()
    diff = end - begin
    print("time cost[%f]s, avg.[%f]s" % (diff.total_seconds(), diff.total_seconds() / episodes))

    with open('stat-result-win.txt', 'w') as f:
        f.write(repr(stat_win))

    # print(perf)
    # self.draw_perf(perf)

    # np.set_printoptions(threshold=np.nan, formatter={'float_kind' : lambda x: "%.4f" % x})
    # with open('stat-result-net-train-errors.txt', 'w') as f:
    #     f.write(repr(np.array(s1.errors)))

    # Ties go to black (win1 >= win2).
    winner = Board.STONE_BLACK if win1 >= win2 else Board.STONE_WHITE
    return self.which_one(winner), max(win1, win2) / total
def reinforce(self, resume=True):
    """Reinforcement training driven by on-disk checkpoints.

    Loads the learner (s1) from the latest RL checkpoint when `resume` is
    True and an opponent pool exists, otherwise from the latest SL
    checkpoint. The opponent (s2) is drawn from the pool (or the SL
    checkpoint when the pool is empty). Whenever s1's win ratio exceeds
    1.1 it is cloned into the pool and a fresh opponent is drawn.
    Win/lose/draw statistics are periodically saved to STAT_FILE, and the
    trained strategy is installed as both players.

    NOTE(review): this block's indentation was reconstructed from collapsed
    source; the exact nesting of the `win_ratio` update inside the episode
    loop follows statement order and should be confirmed against history.
    """
    self.oppo_pool = self.get_mindsets(RL_BRAIN_DIR, FILE_PREFIX)
    part_vars = True
    if resume and len(self.oppo_pool) != 0:
        # Resume RL training from the latest RL checkpoint (full restore).
        file = tf.train.latest_checkpoint(RL_BRAIN_DIR)
        part_vars = False
    else:
        # Bootstrap from the supervised-learning checkpoint (partial restore).
        file = tf.train.latest_checkpoint(SL_BRAIN_DIR)
        part_vars = True
    s1 = StrategyDNN(is_train=False, is_revive=True, is_rl=True, from_file=file, part_vars=part_vars)
    print('I was born from', file)

    if len(self.oppo_pool) != 0:
        # Draw the opponent from the pool of previously cloned minds.
        file = random.choice(self.oppo_pool)
        file = os.path.join(RL_BRAIN_DIR, file)
        part_vars = False
    else:
        file = tf.train.latest_checkpoint(SL_BRAIN_DIR)
        part_vars = True
    s2 = StrategyDNN(is_train=False, is_revive=True, is_rl=False, from_file=file, part_vars=part_vars)
    print('vs.', file)

    stat = []
    # n_lose = 0
    iter_n = 100
    for i in range(iter_n):
        print('iter:', i)
        win1, win2, draw = 0, 0, 0  # reset per iteration
        step_counter, explo_counter = 0, 0
        episodes = cfg.REINFORCE_PERIOD
        for _ in range(episodes):
            # Randomize the learner's color; s1 observes its own games.
            s1.stand_for = random.choice([Board.STONE_BLACK, Board.STONE_WHITE])
            s2.stand_for = Board.oppo(s1.stand_for)
            g = Game(Board.rand_generate_a_position(), s1, s2, observer=s1)
            g.step_to_end()
            win1 += 1 if g.winner == s1.stand_for else 0
            win2 += 1 if g.winner == s2.stand_for else 0
            draw += 1 if g.winner == Board.STONE_EMPTY else 0
            # print('winner: {:d}, stand for: {:d}'.format(g.winner, s1.stand_for))
            # Running win ratio; treated as 1.0 while s2 has no wins yet.
            s1.win_ratio = win1 / win2 if win2 != 0 else 1.
            step_counter += g.step_counter
            explo_counter += g.exploration_counter

        if s1.win_ratio > 1.1:
            # The learner got stronger: clone it into the opponent pool and
            # draw a new opponent (possibly the clone itself).
            file = FILE_PREFIX + '-' + str(i)
            s1.mind_clone(os.path.join(RL_BRAIN_DIR, FILE_PREFIX), i)
            self.oppo_pool.append(file)
            file = random.choice(self.oppo_pool)
            file = os.path.join(RL_BRAIN_DIR, file)
            s2.close()
            s2 = StrategyDNN(is_train=False, is_revive=True, is_rl=False, from_file=file, part_vars=False)
            print('vs.', file)
            # n_lose = 0
        # elif win1 < win2:
        #     n_lose += 1
        # if n_lose >= 50:
        #     break

        if i % 1 == 0 or i + 1 == iter_n:
            total = win1 + win2 + draw
            win1_r = win1 / total
            win2_r = win2 / total
            draw_r = draw / total
            print("iter:%d, win: %.3f, lose: %.3f, draw: %.3f, t: %.3f" % (i, win1_r, win2_r, draw_r, s1.temperature))
            stat.append([win1_r, win2_r, draw_r])
            print('avg. steps[%f], avg. explos[%f]' % (step_counter / episodes, explo_counter / episodes))
        if i % 10 == 0 or i + 1 == iter_n:
            # Periodically persist the accumulated statistics.
            np.savez(STAT_FILE, stat=np.array(stat))

    print('rl done. you can try it.')
    self.strategy_1 = self.strategy_2 = s1