def arena_process(i):
    # Pit checkpoint i+1 (new player) against checkpoint i (previous player).
    g = Game(8)
    nnet = nn(g)
    nnet.load_model(filename=("model_auto_" + str(i + 1)))
    nmcts = MCTS(g, nnet, args)

    pnet = nn(g)
    if i != 0:  # for i == 0 the previous player keeps its random initial weights
        pnet.load_model(filename=("model_auto_" + str(i)))
    pmcts = MCTS(g, pnet, args)

    def player1(x):
        pi = pmcts.get_action_prob(x)
        # display_pi(np.array(pi[:-1]).reshape((len(x), len(x))))
        return np.random.choice(len(pi), p=pi)

    def player2(x):
        pi = nmcts.get_action_prob(x)
        return np.random.choice(len(pi), p=pi)

    arena = Arena(player1=player1, player2=player2, game=g, display=display)
    return arena.play_games(8)
def arena_process(self, r, old_model_file, new_model_file, verbose=False):
    old_net = nn(self.game)
    if len(old_model_file) > 1:
        old_net.load_model(filename=old_model_file)
    else:
        # no checkpoint given: the old player keeps untrained (random) weights
        print('random state')
    old_mcts = MCTS(self.game, old_net, self.args)

    new_net = nn(self.game)
    new_net.load_model(filename=new_model_file)
    new_mcts = MCTS(self.game, new_net, self.args)

    def old_player(x):
        pi = old_mcts.get_action_prob(x, self.args['numMCTSSims'])
        # display_pi(np.array(pi[:-1]).reshape((len(x), len(x))))
        return np.random.choice(len(pi), p=pi)

    def new_player(x):
        pi = new_mcts.get_action_prob(x, self.args['numMCTSSims'])
        return np.random.choice(len(pi), p=pi)

    arena = Arena(player1=old_player, player2=new_player,
                  game=self.game, display=display)
    return arena.play_games(r, verbose=verbose)
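# A minimal driver sketch (not from the source): pit each auto-saved
# checkpoint against its predecessor via arena_process above. The names
# `coach`, NUM_GENERATIONS, and GAMES_PER_MATCH are hypothetical; the
# "model_auto_<n>" filename pattern, the empty-name-means-random-state
# convention, and the (old_wins, new_wins, draws) return triple follow
# the snippets above.
NUM_GENERATIONS = 5
GAMES_PER_MATCH = 8
for i in range(NUM_GENERATIONS):
    old_file = '' if i == 0 else 'model_auto_' + str(i)  # '' -> random state
    new_file = 'model_auto_' + str(i + 1)
    old_wins, new_wins, draws = coach.arena_process(
        GAMES_PER_MATCH, old_file, new_file, verbose=False)
    print('gen %d  OLD/NEW WINS : %d / %d ; DRAWS : %d'
          % (i + 1, old_wins, new_wins, draws))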
def train(self):
    '''
    Iteratively self-play, retrain the network on sampled experiences,
    and keep the new model only if it beats the current one in the Arena.
    '''
    for i in range(1, self.args.num_iters + 1):
        print('----START ITERATION:' + str(i))

        # Play num_episodes games of self-play and accumulate experiences.
        for _ in range(self.args.num_episodes):
            # reset the MCTS search tree
            self.mcts = MCTS(self.env_utils, self.new_model_system, self.args)
            self.experiences.extend(self.run_episode())
        if len(self.experiences) > self.args.max_experiences:
            self.experiences = self.experiences[-self.args.max_experiences:]

        # Randomly sample experiences to build a mini-batch for the NN.
        mini_batch = random.sample(
            self.experiences,
            min(len(self.experiences), self.args.mini_batch_size))

        # Save the current model and reload it into cur_model_system.
        self.new_model_system.save_checkpoint(folder=self.args.checkpoint,
                                              filename='temp.pth.tar')
        self.cur_model_system.load_checkpoint(folder=self.args.checkpoint,
                                              filename='temp.pth.tar')
        cur_mcts = MCTS(self.env_utils, self.cur_model_system, self.args)

        self.new_model_system.fit(mini_batch)
        new_mcts = MCTS(self.env_utils, self.new_model_system, self.args)

        # Pit the new model against the previous version in the Arena.
        print('PITTING AGAINST PREVIOUS VERSION')
        arena = Arena(
            lambda x: np.argmax(cur_mcts.get_action_prob(x, temp=0)),
            lambda x: np.argmax(new_mcts.get_action_prob(x, temp=0)),
            self.env, display=False)
        cur_wins, new_wins, draws = arena.play_games(
            self.args.num_compares_arena)
        print('NEW/CUR WINS : %d / %d ; DRAWS : %d'
              % (new_wins, cur_wins, draws))

        # Accept the new model only if its win rate exceeds update_threshold;
        # otherwise roll back to the saved checkpoint.
        if (cur_wins + new_wins == 0 or
                float(new_wins) / (cur_wins + new_wins)
                < self.args.update_threshold):
            print('REJECTING NEW MODEL')
            self.new_model_system.load_checkpoint(
                folder=self.args.checkpoint, filename='temp.pth.tar')
        else:
            print('ACCEPTING NEW MODEL')
            self.new_model_system.save_checkpoint(
                folder=self.args.checkpoint,
                filename=self.get_checkpoint_filename(i))
            self.new_model_system.save_checkpoint(
                folder=self.args.checkpoint, filename='best.pth.tar')
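# Illustrative hyperparameters for train() above; a sketch, not tuned
# settings. Every key is one that the loop (or the MCTS it builds) actually
# reads elsewhere in these snippets; all values are placeholders.
train_args = dotdict({
    'num_iters': 100,           # outer training iterations
    'num_episodes': 25,         # self-play games per iteration
    'max_experiences': 200000,  # cap on the experience buffer
    'mini_batch_size': 2048,    # samples drawn per fit() call
    'checkpoint': './temp/',    # folder for temp.pth.tar / best.pth.tar
    'num_compares_arena': 40,   # evaluation games in the Arena
    'update_threshold': 0.55,   # win rate required to accept the new model
    'num_MCTS': 128,            # MCTS simulations per move
    'cpuct': 1.0,               # exploration constant
    'dirichlet_eps': 0.25,      # root Dirichlet-noise weight during self-play
})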
hp = HumanPlayer(env).play   # human player
rp = RandomPlayer(env).play  # random player
gp = GreedyPlayer(env, env_utils).play   # greedy (always picks the move that captures the most stones immediately)
agp = AntiGreedyPlayer(env, env_utils).play  # anti-greedy (the opposite of greedy)
cgp = CompositeGreedyPlayer(env, env_utils).play  # composite greedy (anti-greedy in the first half, greedy in the second)

# Settings for AlphaZero player 1
ms1 = ModelSystem(env)
ms1.load_checkpoint('temp', 'best.pth.tar')  # load the DNN
args1 = dotdict({'num_MCTS': 128, 'cpuct': 1.0, 'dirichlet_eps': 0.0})
mcts1 = MCTS(env_utils, ms1, args1)  # MCTS only reads num_MCTS and dirichlet_eps.
azp1 = lambda x: np.argmax(mcts1.get_action_prob(x, temp=0))  # AlphaZero player object

# Settings for AlphaZero player 2 (for pitting AlphaZero against itself)
ms2 = ModelSystem(env)
ms2.load_checkpoint('temp', 'best.pth.tar')  # load the DNN
args2 = dotdict({'num_MCTS': 100, 'cpuct': 1.0, 'dirichlet_eps': 0.0})
mcts2 = MCTS(env_utils, ms2, args2)  # MCTS only reads num_MCTS and dirichlet_eps.
azp2 = lambda x: np.argmax(mcts2.get_action_prob(x, temp=0))  # AlphaZero player object

###############################################################################
# Match setup
arena = Arena(player1=azp1, player2=hp, env=env, display=None)
# Start the match
print(arena.play_games(2, verbose=True))
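# Variation (a sketch, not from the source): to pit the two AlphaZero
# players against each other instead of the human player, pass azp2 as
# player2. Ten games is an arbitrary choice.
arena_az = Arena(player1=azp1, player2=azp2, env=env, display=None)
print(arena_az.play_games(10, verbose=False))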
def learn(self):
    """
    Performs numIters iterations with numEps episodes of self-play in each
    iteration. After every iteration, it retrains the neural network with
    examples in trainExamples (which has a maximum length of maxlenOfQueue).
    It then pits the new neural network against the old one and accepts it
    only if it wins >= updateThreshold fraction of games.
    """
    for i in range(1, self.args.numIters + 1):
        # bookkeeping
        print('------ITER ' + str(i) + '------')
        # examples of the iteration
        if not self.skipFirstSelfPlay or i > 1:
            iterationTrainExamples = deque([], maxlen=self.args.maxlenOfQueue)
            # eps_time = AverageMeter()
            # bar = Bar('Self Play', max=self.args.numEps)
            # end = time.time()
            for eps in range(self.args.numEps):
                self.mcts = MCTS(self.game, self.nnet, self.args)  # reset search tree
                iterationTrainExamples += self.execute_episode()

            # save the iteration examples to the history
            self.trainExamplesHistory.append(iterationTrainExamples)

        if len(self.trainExamplesHistory) > self.args.numItersForTrainExamplesHistory:
            print("len(trainExamplesHistory) =", len(self.trainExamplesHistory),
                  " => remove the oldest trainExamples")
            self.trainExamplesHistory.pop(0)
        # backup history to a file
        # NB! the examples were collected using the model from the previous iteration, so (i-1)
        self.save_train_examples(i - 1)

        # shuffle examples before training
        trainExamples = []
        for e in self.trainExamplesHistory:
            trainExamples.extend(e)
        shuffle(trainExamples)

        # training new network, keeping a copy of the old one
        self.nnet.save_checkpoint(folder=self.args.checkpoint, filename='temp.pth.tar')
        self.pnet.load_checkpoint(folder=self.args.checkpoint, filename='temp.pth.tar')
        pmcts = MCTS(self.game, self.pnet, self.args)

        self.nnet.train(trainExamples)
        nmcts = MCTS(self.game, self.nnet, self.args)

        print('PITTING AGAINST PREVIOUS VERSION')
        arena = Arena(
            lambda x: np.argmax(pmcts.get_action_prob(x, temp=0)),
            lambda x: np.argmax(nmcts.get_action_prob(x, temp=0)),
            self.game)
        pwins, nwins, draws = arena.play_games(self.args.arenaCompare)

        print('NEW/PREV WINS : %d / %d ; DRAWS : %d' % (nwins, pwins, draws))
        if pwins + nwins == 0 or float(nwins) / (pwins + nwins) < self.args.updateThreshold:
            print('REJECTING NEW MODEL')
            self.nnet.load_checkpoint(folder=self.args.checkpoint, filename='temp.pth.tar')
        else:
            print('ACCEPTING NEW MODEL')
            self.nnet.save_checkpoint(folder=self.args.checkpoint,
                                      filename=self.get_checkpoint_file(i))
            self.nnet.save_checkpoint(folder=self.args.checkpoint,
                                      filename='best.pth.tar')
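# Illustrative settings for learn() above (a sketch; the values are
# placeholders, not tuned). Only keys the method actually reads are shown.
learn_args = dotdict({
    'numIters': 1000,                       # training iterations
    'numEps': 100,                          # self-play episodes per iteration
    'maxlenOfQueue': 200000,                # max examples kept per iteration
    'numItersForTrainExamplesHistory': 20,  # iterations of history to keep
    'checkpoint': './temp/',                # checkpoint folder
    'arenaCompare': 40,                     # evaluation games in the Arena
    'updateThreshold': 0.6,                 # win rate needed to accept
})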
players = [('RandomPlayer', rp, 1),
           ('GreedyPlayer', gp, 0),
           ('AntiGreedyPlayer', agp, 0),
           ('CompositeGreedyPlayer', cgp, 0),
           ('mini-AlphaZero', azp1, 1)]

###############################################################################
num_trial = 100
results = np.zeros(shape=(len(players), len(players)))
for i in range(len(players)):
    for j in range(i + 1):
        arena = Arena(player1=players[i][1], player2=players[j][1],
                      env=env, display=None)
        # If both players are deterministic, two games are enough.
        actual_num_trial = 2 if (players[i][2] == 0 and players[j][2] == 0) else num_trial
        one_win, two_win, draw = arena.play_games(actual_num_trial, verbose=False)
        print(players[i][0], ' vs ', players[j][0], ' : ',
              one_win / float(actual_num_trial), ' wins.')
        results[i, j] = one_win / float(actual_num_trial)

###############################################################################
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

df = pd.DataFrame(results, index=[x[0] for x in players],
                  columns=[x[0] for x in players])
plt.figure(figsize=(10, 10))
sns.heatmap(df, annot=True)
plt.show()
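# Optional follow-up (an assumption, not in the source): results only fills
# the lower triangle, so mirror it to populate the whole heatmap. Because
# only win rates, not draw counts, were stored above, this counts the row
# player's draws as wins in the mirrored half -- an approximation.
full = results + np.triu(1.0 - results.T, k=1)
df_full = pd.DataFrame(full, index=df.index, columns=df.columns)
plt.figure(figsize=(10, 10))
sns.heatmap(df_full, annot=True)
plt.show()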