def get_act_afterstates(self, states):
    tmp_game = Geister2()
    max_num = self.bttl_num
    mat = np.zeros((len(states), max_num))
    for num in range(max_num):
        for act_i in range(len(states)):
            # Copy the piece positions and our own piece colors into tmp_game
            num_red = 0   # number of enemy red pieces still hidden on the board
            is_vld = []   # indices of enemy pieces still on the board
            for i in range(16):
                tmp_game.units[i].x = self._game.units[i].x
                tmp_game.units[i].y = self._game.units[i].y
                tmp_game.units[i].taken = self._game.units[i].taken
                tmp_game.units[i].color = self._game.units[i].color
                if i >= 8:
                    tmp_game.units[i].color = 1  # treat the enemy piece as blue for now
                    if tmp_game.units[i].taken is False:
                        is_vld.append(i)
                        # if the enemy piece is actually red, add 1 to num_red
                        num_red += 1 if self._game.units[i].color == 3 else 0
            # The enemy colors are not estimated; assign them at random
            self._rnd.shuffle(is_vld)
            for i in is_vld[:num_red]:
                tmp_game.units[i].color = 3  # set as an enemy red piece
            tmp_game.on_action_number_received(act_i)
            mat[act_i, num] = battle_from(self.policy, self.policy,
                                          tmp_game=tmp_game)
    means = mat.mean(axis=1)
    return np.argmax(means)
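# Illustrative sketch (not part of the original code): the loop above "determinizes"
# the opponent's hidden colors by resetting every on-board enemy piece to blue and
# then re-assigning the remaining number of reds to random pieces. The helper below
# repeats that idea on plain containers; its name, the piece indices, and the color
# codes (1 = blue, 3 = red, as used above) are assumptions for illustration only.
def _demo_determinize_colors(on_board_indices, num_red, rnd):
    """Return a {piece index: color} dict with num_red random pieces set to red."""
    colors = {i: 1 for i in on_board_indices}   # start with every hidden piece as blue
    candidates = list(on_board_indices)
    rnd.shuffle(candidates)
    for i in candidates[:num_red]:               # promote num_red of them to red
        colors[i] = 3
    return colors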
def test():
    seed = 2
    game = Geister2()
    tdagent = MCAgent(game, seed)
    # Hand-crafted value weights used only for this test
    tdagent.w = np.array([
        0.9, 0, 0, 0, 0, 0,
        0.8, 0, 0, 0, 0, 0,
        0.7, 0, 0, 0, 0, 0,
        0.6, 0, 0, 0, 0, 0,
        0.5, 0, 0, 0, 0, 0,
        0.1, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 1,
        0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0
    ])
    rndagent = RandomAgent(game, seed)
    agents = (tdagent, rndagent)
    arr0, arr1 = (agent.init_red() for agent in agents)
    game.setRed(arr0)
    game.changeSide()
    game.setRed(arr1)
    game.changeSide()
    game.printBoard()
    player = 0
    while not game.is_ended():
        agent = agents[player]
        states = game.after_states()
        i_act = agent.get_act_afterstates(states)
        game.on_action_number_received(i_act)
        if player == 0:
            game.printBoard()
        game.changeSide()
        player = (player + 1) % 2
def battle2():
    seed = 301
    bttl_num = 50
    game = Geister2()
    agents = [[MCAgent(game, seed + i) for i in range(8)],
              [MCAgent(game, seed + i + 8) for i in range(8)]]
    agents_str = ["weights_2/td_learned2_" + str(i) + ".npy"
                  for i in range(1, 9)]
    for agent, string in zip(agents[0], agents_str):
        agent.w = load(string)
    agents_str = ["weights_3/td_learned3_" + str(i) + ".npy"
                  for i in range(1, 9)]
    for agent, string in zip(agents[1], agents_str):
        agent.w = load(string)
    means = np.zeros((8, 8))
    for i in range(8):
        for j in range(8):
            r_list = np.zeros(bttl_num)
            for t in range(bttl_num):
                agent_s = (agents[0][i], agents[1][j])
                arr0, arr1 = (agent.init_red() for agent in agent_s)
                game.__init__()
                game.setRed(arr0)
                game.changeSide()
                game.setRed(arr1)
                game.changeSide()
                player = 0
                while not game.is_ended():
                    agent = agent_s[player]
                    states = game.after_states()
                    i_act = agent.get_act_afterstates(states)
                    game.on_action_number_received(i_act)
                    game.changeSide()
                    player = (player + 1) % 2
                if player == 1:
                    game.changeSide()
                result = game.checkResult()
                r = 1 if result > 0 else (-1 if result < 0 else 0)
                r_list[t] = r
            means[i][j] = r_list.mean()
    print(means)
    print("mean: ", means.mean())
def learn():
    file_name = "td_9"
    seed = 91
    game = Geister2()
    mcagent = MCAgent(game, seed)
    opponent = RandomAgent(game, seed + 1)
    env = VsEnv(opponent, game, seed)
    mcagent.learn(env, seed)
    # for k in range(6*7*3):
    #     for i in range(3):
    #         for j in range(7):
    #             print((mcagent.w[j+i*(6*7)+k*(6*7*3):6+j+i*(6*7)+k*(6*7*3)]
    #                    * 1000).round()*(1/1000))
    #         print("-----------")
    #     print("-------------------")
    np.save(file_name, mcagent.w)
    w_td = np.load(file_name + '.npy')
    print(w_td.shape)
def battle():
    seed = 29
    bttl_num = 10
    game = Geister2()
    agents_str = ["weights/weights_13/reinforce_" + str(i) + "_theta.npy"
                  for i in range(1, 9)]
    agent_len = len(agents_str)
    agents = [REINFORCEAgent(game, seed + i) for i in range(agent_len)]
    for agent, string in zip(agents, agents_str):
        agent.theta = load(string)
    means = np.zeros((agent_len, agent_len))
    for i in range(len(agents)):
        for j in range(i, len(agents)):
            if i == j:
                continue
            r_list = np.zeros(bttl_num)
            for t in range(bttl_num):
                agent_s = (agents[i], agents[j])
                arr0, arr1 = (agent.init_red() for agent in agent_s)
                game.__init__()
                game.setRed(arr0)
                game.changeSide()
                game.setRed(arr1)
                game.changeSide()
                player = 0
                while not game.is_ended():
                    agent = agent_s[player]
                    states = game.after_states()
                    i_act = agent.get_act_afterstates(states)
                    game.on_action_number_received(i_act)
                    game.changeSide()
                    player = (player + 1) % 2
                if player == 1:
                    game.changeSide()
                result = game.checkResult()
                r = 1 if result > 0 else (-1 if result < 0 else 0)
                r_list[t] = r
            means[i][j] = r_list.mean()
            means[j][i] = -means[i][j]
    print(means)
def learn():
    file_name = "weights/rfvsrnd6"
    seed = 103
    game = Geister2()
    agent = REINFORCEAgent(game, seed)
    agent.w = np.random.randn(agent.W_SIZE) * agent.alpha * 0.0001
    agent.theta = np.random.randn(agent.T_SIZE) * agent.beta * 0.0001
    opponent = RandomAgent(game, seed + 1)
    env = VsEnv(opponent, game, seed)
    # Set up and start profiling
    pr = cProfile.Profile()
    pr.enable()
    agent.learn(env, seed)
    # Stop profiling and print the results
    pr.disable()
    stats = pstats.Stats(pr)
    stats.sort_stats('cumtime')
    stats.print_stats()
    pr.dump_stats('profile.stats')
    # Save the learned weights
    np.save(file_name + "_w", agent.w)
    np.save(file_name + "_theta", agent.theta)
def test2():
    mcagent = MCAgent(Geister2())
    mcagent.w = np.load("td_4.npy")
    print(mcagent.init_red())
def ranking_learn(game):
    # Load the ranking_learn bookkeeping data
    with open(ranking_data_path, 'rt') as fin:
        cin = csv.reader(fin)
        datas = [row for row in cin if len(row) > 0]
    num_weights = int(datas[0][0])
    # Load the ranking data
    ranking_path = []
    ranking_n = []
    ranking_r = []
    with open(rankings_path, 'rt') as fin:
        cin = csv.reader(fin)
        datas = [row for row in cin if len(row) > 0]
    ranking_path = [row[2] for row in datas]
    ranking_n = [int(row[3]) for row in datas]
    ranking_r = [float(row[4]) for row in datas]
    # If the ranking has too few members, add REINFORCE agents with random weights
    while len(ranking_path) < num_rankingagents:
        ranking_path.append(get_path_radom(game))
        ranking_n.append(0)
        ranking_r.append(0)
    # If the ranking has too many members, report an error
    if len(ranking_path) > num_rankingagents:
        print("error. ranking_num is over num_rankingagents")
        # path_list.append(ranking_path.pop(0))  # move the head of the ranking
        # del ranking_n[0]
        # del ranking_r[0]
    # Load the weights of the ranked agents
    game = Geister2()
    train_is = rnd.sample(range(num_rankingagents), num_rankingagents // 2)
    test_is = [i for i in range(num_rankingagents) if i not in train_is]
    rank_agents = load_agents(ranking_path, game, None)
    train_agents = [rank_agents[i] for i in train_is]
    test_agents = [rank_agents[i] for i in test_is]
    # Create a new agent
    agent = pick_agent(game)
    agent_path = weights_path + "/rankRF" + str(num_weights)
    # Train the agent
    env = VsEnvs(train_agents, game, None)  # the opponent is drawn at random only once
    agent.learn(env, max_episodes=max_episodes)
    # Battle against the current ranking again (only test_agents are updated)
    # and add the agent to the ranking if it meets the criteria
    results = []
    for i in test_is:
        test_agent = rank_agents[i]
        # result is the win rate of agent
        result = battle(agent, test_agent, bttl_num=bttl_num, seed=None)
        results.append(result)
        # Update the opponent's win rate
        r_opp = -result
        ranking_r[i] = (ranking_r[i] * ranking_n[i] + r_opp) / (ranking_n[i] + 1)
        ranking_n[i] += 1
    results = np.array(results)
    # If the criteria are met (mean result above the threshold and wins against a
    # majority of opponents), add the agent to the ranking
    if (results.mean() > threshold
            and len(np.where(results > 0)[0]) > num_rankingagents / 2):
        # Replace the test agent with the lowest win rate
        dl_index = ranking_r.index(min([ranking_r[i] for i in test_is]))
        ranking_path[dl_index] = agent_path
        ranking_n[dl_index] = ranking_r[dl_index] = 0
    # Save the agent's weights
    np.save(agent_path + "_w", agent.w)
    np.save(agent_path + "_theta", agent.theta)
    num_weights += 1
    # Write back the ranking_learn data
    with open(ranking_data_path, 'wt') as fout:
        csvout = csv.writer(fout)
        datas = [[str(num_weights)]]
        csvout.writerows(datas)
    # Write back the ranking data
    datas = [[str(i + 1), "REINFORCEAgent", ranking_path[i], n, r]
             for i, n, r in zip(range(len(ranking_path)), ranking_n, ranking_r)]
    with open(rankings_path, 'wt') as fout:
        csvout = csv.writer(fout)
        csvout.writerows(datas)
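# Illustrative sketch (not part of the original code): the ranking update above keeps
# a running mean of each ranked opponent's result, ranking_r[i], over ranking_n[i]
# games via ranking_r[i] = (ranking_r[i] * ranking_n[i] + r_opp) / (ranking_n[i] + 1).
# The helper below shows the same incremental-mean step in isolation; its name and
# arguments are hypothetical.
def _demo_update_running_mean(mean, count, new_value):
    """Fold one new observation into a running mean, returning (mean, count)."""
    mean = (mean * count + new_value) / (count + 1)
    return mean, count + 1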
if __name__ == "__main__":
    game = Geister2()
    # Set up and start profiling
    pr = cProfile.Profile()
    pr.enable()
    while (True):
        ranking_learn(game)
    # Stop profiling and print the results
    pr.disable()
    stats = pstats.Stats(pr)
    stats.sort_stats('cumtime')
    stats.print_stats()
    pr.dump_stats('profile.stats')
def cluster_learn():
    seed = 122
    file_name = "weights/weights_16/reinforce_"
    agents_len = 18
    max_episodes = 500 * (agents_len)
    plt_intvl = 50 * (agents_len)
    plt_bttl = 200
    # Line styles correspond to alpha
    linestyles = [':', '--', '-.']
    # linestyle=(0, (1, 0))
    # Colors correspond to beta: m = magenta, c = cyan
    plt_colors = ['m', 'r', 'g', 'c', 'b', 'y']
    linestyle_avg = '-'
    plt_color_avg = 'k'
    alphas = [0.003, 0.005, 0.01]
    betas = [0.0005, 0.0001, 0.0003, 0.0005, 0.001, 0.0015]
    assert (len(linestyles) == len(alphas))
    assert (len(plt_colors) == len(betas))
    assert (len(alphas) * len(betas) == agents_len)
    game = Geister2()
    np.random.seed(seed)
    rnd = random.Random(seed)
    agents = [REINFORCEAgent(game, seed + i) for i in range(agents_len)]
    for i in range(len(alphas)):
        for j in range(len(betas)):
            agents[i + j * len(alphas)].alpha = alphas[i]
            agents[i + j * len(alphas)].beta = betas[j]
    # Initialize any unset weights (zeros here)
    for agent in agents:
        if agent.w is None:
            agent.w = np.zeros(agent.W_SIZE)
        if agent.theta is None:
            agent.theta = np.zeros(agent.T_SIZE)
    episodes_x = []
    results_y = [[] for _ in range(agents_len)]
    avg_y = []
    rnd_agent = RandomAgent(game, seed * 2 + 1)
    env = VsEnv(agents[0], game, seed)
    for episode in range(max_episodes):
        # Pick each learner exactly once per pass, in random order
        for i in rnd.sample(range(agents_len), agents_len):  # e.g. [2, 0, 1]
            # i = rnd.randrange(agents_len)  # (alternative) pick the learner at random
            # # (alternative) let every candidate act as the opponent once, in random order
            # for j in rnd.sample(range(agents_len), agents_len):
            j = rnd.randrange(agents_len)  # the opponent is drawn at random once
            agent = agents[i]
            env._opponent = agents[j]
            agent.learn(env, max_episodes=1)
        # Periodically plot the results against the random agent
        if (episode + 1) % plt_intvl == 0:
            episodes_x.append(episode)
            plt.clf()
            opponent = rnd_agent
            env._opponent = opponent
            avgs = []
            for i in range(agents_len):
                agent = agents[i]
                theta = agent.theta
                r_list = np.zeros(plt_bttl)
                for bttl_i in range(plt_bttl):
                    afterstates = env.on_episode_begin(agent.init_red())
                    x = agent.get_x(afterstates)
                    a = agent.get_act(x, theta)
                    for t in range(300):
                        r, nafterstates = env.on_action_number_received(a)
                        if r != 0:
                            break
                        nx = agent.get_x(nafterstates)
                        na = agent.get_act(nx, theta)
                        x = nx
                        a = na
                    r_list[bttl_i] = r
                mean = r_list.mean()
                avgs.append(mean)
                results_y[i].append(mean)
                plt.figure(1)
                plt.title('Training...')
                plt.xlabel('Episode')
                plt.ylabel('Mean Results')
                x_list = np.array(episodes_x)
                y_list = np.array(results_y[i])
                plt.plot(x_list, y_list,
                         linestyle=linestyles[i % len(alphas)],
                         c=plt_colors[i // len(alphas)],
                         label=str(i))
            avg_y.append(np.array(avgs).mean())
            plt.figure(1)
            plt.title('Training...')
            plt.xlabel('Episode')
            plt.ylabel('Mean Results')
            x_list = np.array(episodes_x)
            y_list = np.array(avg_y)
            plt.plot(x_list, y_list,
                     linestyle=linestyle_avg, c=plt_color_avg,
                     label=agents_len)
            plt.pause(0.01)  # pause a bit so that plots are updated
    plt.savefig(file_name + ".png")
    plt.show()
    for i in range(agents_len):
        np.save(file_name + str(i + 1) + "_w", agents[i].w)
        np.save(file_name + str(i + 1) + "_theta", agents[i].theta)
def setUp(self):
    game = Geister2()
    game.setRed(["E", "F", "G", "H"])
    game.changeSide()
    game.setRed(["E", "F", "G", "H"])
    self.game = game
def learn(self, env, seed=1, max_episodes=100000,
          draw_mode=False, draw_opp=None):
    alpha = self.alpha
    beta = self.beta
    # epsilon = self.epsilon
    # rnd = self._rnd
    assert (env.S_SIZE == self.S_SIZE)
    plt_intvl = 500
    plt_bttl = 50
    episodes_x = []
    results_y = []
    dlts_y = []
    dlts = []
    # Load previously learned weights
    # mcagent.w = np.load("td_4.npy")
    # Initialize w with small normal random numbers
    # np.random.seed(seed)
    # if self.w is None:
    #     self.w = np.random.randn(self.W_SIZE)*alpha*0.1
    # if self.theta is None:
    #     self.theta = np.random.randn(self.T_SIZE)*beta*0.1
    w = self.w
    theta = self.theta
    if draw_mode:
        denv = VsEnv(draw_opp, game=Geister2(), seed=seed)
    for episode in range(max_episodes):
        afterstates = env.on_episode_begin(self.init_red())
        xs = self.get_x([env.get_state()])[0]
        x = self.get_x(afterstates)
        a = self.get_act(x, theta)
        xs_list = [xs]
        x_list = [x]
        xa_list = [x[a]]
        for t in range(self.MAX_T):
            r, nafterstates = env.on_action_number_received(a)
            if r != 0:
                break
            nxs = self.get_x([env.get_state()])[0]
            nx = self.get_x(nafterstates)
            na = self.get_act(nx, theta)
            xs_list.append(nxs)
            x_list.append(nx)
            xa_list.append(nx[na])
            x = nx
            a = na
        for xa, x, xs in zip(xa_list, x_list, xs_list):
            # The reward prediction must not use the afterstate
            q = 2 / (1 + np.exp(-np.dot(w, xs))) - 1
            dlt = r - q
            dlts.append(dlt**2)
            w += beta * dlt * xs
            hs = x.dot(theta)
            hs -= hs.max()  # subtract the max to avoid overflow
            exps = np.exp(hs)
            pis = exps / exps.sum()
            theta += alpha * r * (xa - pis.dot(x))
            # Annealed variant:
            # theta += alpha*(episode/max_episodes)*r*(xa - pis.dot(x))
        if draw_opp is None and draw_mode:
            print("not implemented")
            raise Exception
        if draw_mode and ((episode + 1) % plt_intvl == 0):
            dlts_y.append(np.array(dlts).mean())
            dlts = []
            if draw_opp is not None:
                denv._opponent = draw_opp
                r_sum = 0.0
                for bttl_i in range(plt_bttl):
                    afterstates = denv.on_episode_begin(self.init_red())
                    x = self.get_x(afterstates)
                    a = self.get_act(x, theta)
                    for t in range(300):
                        r, nafterstates = denv.on_action_number_received(a)
                        if r != 0:
                            break
                        nx = self.get_x(nafterstates)
                        na = self.get_act(nx, theta)
                        x = nx
                        a = na
                    r_sum += r
                results_y.append(r_sum / plt_bttl)
            episodes_x.append(episode)
            # First plot: results
            plt.figure(2)
            plt.title('Training...')
            plt.xlabel('Episode')
            plt.ylabel('Mean Results of Interval')
            plt.text(50, 0.5, "alpha=" + str(self.alpha))
            plt.text(50, 0.4, "beta=" + str(self.beta))
            x_list = np.array(episodes_x)
            y_list = np.array(results_y)
            plt.plot(x_list, y_list)
            plt.pause(0.0001)  # pause a bit so that plots are updated
            plt.clf()
            # Second plot: prediction error Δv(s)^2
            plt.figure(1)
            plt.title('Training...')
            plt.xlabel('Episode')
            plt.ylabel('Mean Dlt v(s)^2')
            plt.text(50, 0.5, "alpha=" + str(self.alpha))
            plt.text(50, 0.4, "beta=" + str(self.beta))
            x_list = np.array(episodes_x)
            y_list = np.array(dlts_y)
            plt.plot(x_list, y_list)
            plt.pause(0.0001)  # pause a bit so that plots are updated
            plt.clf()
    # After training
    if (draw_mode):
        # First plot: results
        plt.figure(2)
        plt.title('Training...')
        plt.xlabel('Episode')
        plt.ylabel('Mean Results of Interval')
        plt.text(50, 0.5, "alpha=" + str(self.alpha))
        plt.text(50, 0.4, "beta=" + str(self.beta))
        x_list = np.array(episodes_x)
        y_list = np.array(results_y)
        plt.plot(x_list, y_list)
        plt.show()
        # Second plot: prediction error Δv(s)^2
        plt.figure(1)
        plt.title('Training...')
        plt.xlabel('Episode')
        plt.ylabel('Mean Dlt v(s)^2')
        plt.text(50, 0.5, "alpha=" + str(self.alpha))
        plt.text(50, 0.4, "beta=" + str(self.beta))
        x_list = np.array(episodes_x)
        y_list = np.array(dlts_y)
        plt.plot(x_list, y_list)
        plt.show()
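# Illustrative sketch (not part of the original code): learn() treats theta as the
# parameters of a softmax policy over afterstate feature vectors and applies a
# REINFORCE-style update, theta += alpha * r * (x_a - sum_b pi(b) * x_b), using the
# terminal reward r as the return for every step of the episode. The function below
# reproduces that single update on a small feature matrix; its name and the shapes of
# its arguments are assumptions for illustration only.
def _demo_reinforce_step(theta, x, a, r, alpha):
    """One policy-gradient step for a softmax policy over the rows of x."""
    hs = x.dot(theta)
    hs -= hs.max()              # subtract the max to avoid overflow in exp
    exps = np.exp(hs)
    pis = exps / exps.sum()     # pi(b) for every candidate action b
    return theta + alpha * r * (x[a] - pis.dot(x))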
        game.changeSide()
        game.setRed(arr1)
        game.changeSide()
        player = 0
        while not game.is_ended():
            agent = agents[player]
            states = game.after_states()
            i_act = agent.get_act_afterstates(states)
            game.on_action_number_received(i_act)
            game.changeSide()
            player = (player + 1) % 2
        if player == 1:
            game.changeSide()
        result = game.checkResult()
        r = 1 if result > 0 else (-1 if result < 0 else 0)
        results[t] = r
    return results.mean()


if __name__ == "__main__":
    seed = 100
    geister = Geister2()
    agents1 = [load_agent("weights/rfvsrnd5", geister, seed)]
    agents2 = [load_agent("weights/weights_13/reinforce_" + str(i), geister, seed)
               for i in range(1, 10)]
    results = battle2(agents1, agents2, 100)
    print(results)
def __init__(self, opponent, game=Geister2(), seed=0):
    self._opponent = opponent
    self._game = game
    self._seed = seed
    self.S_SIZE = (6 * 6 + 6) * 3
def self_play(file_name, agent=None, max_train=50000):
    seed = 0
    max_episodes = max_train
    plt_intvl = max_episodes + 1  # effectively disables periodic plotting
    plt_bttl = 200
    # Line style corresponds to alpha
    linestyle = '-'
    # linestyle=(0, (1, 0))
    # Color corresponds to beta: m = magenta, c = cyan, etc.
    plt_color = 'k'
    alpha = 0.001
    beta = 0.0001
    game = Geister2()
    np.random.seed(seed)
    if agent is None:
        agent = REINFORCEAgent(game, seed)
        agent.w = np.zeros(agent.W_SIZE)
        agent.theta = np.zeros(agent.T_SIZE)
    agent.alpha = alpha
    agent.beta = beta
    episodes_x = []
    results_y = []
    rnd_agent = RandomAgent(game, seed * 2 + 1)
    env = VsEnv(agent, game, seed)
    denv = VsEnv(rnd_agent, game, seed)
    for episode in range(max_episodes):
        agent.alpha = alpha  # * (1 - episode/max_episodes)
        agent.beta = beta  # * (1 - episode/max_episodes)
        agent.learn(env, max_episodes=1)
        # Periodically plot the results against the random agent
        if (episode) % plt_intvl == 0:
            episodes_x.append(episode)
            plt.clf()
            opponent = rnd_agent
            denv._opponent = opponent
            theta = agent.theta
            r_list = np.zeros(plt_bttl)
            for bttl_i in range(plt_bttl):
                afterstates = denv.on_episode_begin(agent.init_red())
                x = agent.get_x(afterstates)
                a = agent.get_act(x, theta)
                for t in range(300):
                    r, nafterstates = denv.on_action_number_received(a)
                    if r != 0:
                        break
                    nx = agent.get_x(nafterstates)
                    na = agent.get_act(nx, theta)
                    x = nx
                    a = na
                r_list[bttl_i] = r
            mean = r_list.mean()
            results_y.append(mean)
            plt.figure(1)
            plt.title('Training...')
            plt.xlabel('Episode')
            plt.ylabel('Mean Results')
            x_list = np.array(episodes_x)
            y_list = np.array(results_y)
            plt.plot(x_list, y_list, linestyle=linestyle, c=plt_color)
            plt.pause(0.01)  # pause a bit so that plots are updated
    plt.savefig(file_name + ".png")
    # Changed for self-play: save the learned weights
    np.save(file_name + "_w", agent.w)
    np.save(file_name + "_theta", agent.theta)
    # Convert to numpy arrays and save the plotted data
    np.save(file_name + "x_list", np.array(x_list))
    np.save(file_name + "results_y", np.array(results_y))
def __init__(self, opponents, game=Geister2(), seed=0):
    self._opponents = opponents
    opp = rnd.choice(opponents)
    super().__init__(opponent=opp, game=game, seed=seed)
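# Usage sketch (assuming load_agents and agent.learn as used elsewhere in this code):
# VsEnvs wraps a pool of opponents so that learn() can treat it like a single VsEnv.
# As noted in ranking_learn(), the opponent is drawn at random only once, here at
# construction time.
#
#     pool = load_agents(ranking_path, game, seed)
#     env = VsEnvs(pool, game, seed)
#     agent.learn(env, max_episodes=max_episodes)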
def add_in_ranking(path_list):
    # Load the ranking data
    ranking_path = []
    ranking_n = []
    ranking_r = []
    with open('rankings', 'rt') as fin:
        cin = csv.reader(fin)
        datas = [row for row in cin if len(row) > 0]
    ranking_path = [row[2] for row in datas]
    ranking_n = [int(row[3]) for row in datas]
    ranking_r = [float(row[4]) for row in datas]
    # If the ranking has too few members, add agents from the candidate list
    while len(ranking_path) < agentsnum:
        ranking_path.append(path_list.pop(0))  # move the head of path_list
        ranking_n.append(0)
        ranking_r.append(0)
    # If the ranking has too many members, move the extras back to the candidates
    while len(ranking_path) > agentsnum:
        path_list.append(ranking_path.pop(0))  # move the head of the ranking
        del ranking_n[0]
        del ranking_r[0]
    # Load the agents' weights
    game = Geister2()
    agents = load_agents(path_list, game, seed)
    rank_agents = load_agents(ranking_path, game, seed)
    # Battle each candidate and add it to the ranking if the criteria are met
    for i in range(len(agents)):
        agent = agents[i]
        results = []
        # Reload the latest ranking members
        rank_agents = load_agents(ranking_path, game, seed)
        for j in range(len(rank_agents)):
            rank_agent = rank_agents[j]
            # result is the win rate of agent
            result = battle(agent, rank_agent, bttl_num=bttl_num, seed=seed)
            results.append(result)
            # Update the opponent's win rate
            r_opp = -result
            ranking_r[j] = (ranking_r[j] * ranking_n[j] + r_opp) / (ranking_n[j] + 1)
            ranking_n[j] += 1
        results = np.array(results)
        # Skip this candidate if the criteria are not met
        if (results.mean() <= threshold
                or len(np.where(results > 0)[0]) <= agentsnum / 2):
            continue
        # Criteria met (mean result above the threshold and wins against a majority):
        # add the candidate, replacing the ranked member with the lowest win rate
        dl_index = ranking_r.index(min(ranking_r))
        ranking_path[dl_index] = path_list[i]
        ranking_n[dl_index] = ranking_r[dl_index] = 0
    # Write back the ranking data
    datas = [[str(i + 1), "REINFORCEAgent", ranking_path[i], n, r]
             for i, n, r in zip(range(len(ranking_path)), ranking_n, ranking_r)]
    with open('rankings', 'wt') as fout:
        csvout = csv.writer(fout)
        csvout.writerows(datas)