def play(protagonist,
         protagonist_agent_type='greedy',
         opponent_agent_type='rand',
         board_size=8,
         num_rounds=100,
         protagonist_search_depth=1,
         opponent_search_depth=1,
         rand_seed=0,
         env_init_rand_steps=0,
         num_disk_as_reward=False,
         render=True):
    print('protagonist: {}'.format(protagonist_agent_type))
    print('opponent: {}'.format(opponent_agent_type))

    protagonist_policy = create_policy(policy_type=protagonist_agent_type,
                                       board_size=board_size,
                                       seed=rand_seed,
                                       search_depth=protagonist_search_depth)
    opponent_policy = create_policy(policy_type=opponent_agent_type,
                                    board_size=board_size,
                                    seed=rand_seed,
                                    search_depth=opponent_search_depth)

    # disable .run
    def nop(*args):
        pass

    opponent_policy.run = nop
    if not hasattr(protagonist_policy, 'run'):
        protagonist_policy.run = nop

    # if opponent_agent_type == 'human':
    #     render_in_step = True
    # else:
    #     render_in_step = False

    env = othello.SimpleOthelloEnv(board_size=board_size,
                                   seed=rand_seed,
                                   initial_rand_steps=env_init_rand_steps,
                                   num_disk_as_reward=num_disk_as_reward,
                                   render_in_step=render)

    win_cnts = draw_cnts = lose_cnts = 0
    for i in range(num_rounds):
        switch = np.random.randint(2)
        if switch:
            protagonist = protagonist * -1

        policy = {}
        if protagonist == -1:
            pcolor = 'BLACK'
            policy['black'] = protagonist_policy
            policy['white'] = opponent_policy
        else:
            pcolor = 'WHITE'
            policy['black'] = opponent_policy
            policy['white'] = protagonist_policy

        print('Episode {}'.format(i + 1))
        print('Protagonist is {}'.format(pcolor))

        obs_b = env.reset()
        state_b = make_state(obs_b, env.player_turn)
        protagonist_policy.reset(env)
        opponent_policy.reset(env)
        if render:
            env.render()

        done_b = done_w = False
        init = True
        while not (done_b or done_w):
            # black
            assert env.player_turn == -1
            action_b = policy['black'].get_action(state_b)
            next_obs_b, reward_b, done_b, _ = env.step(action_b)
            next_state_b = make_state(next_obs_b, env.player_turn)
            while (not done_b) and env.player_turn == -1:
                policy['black'].run(state_b, action_b, reward_b, done_b,
                                    next_state_b)
                action_b = policy['black'].get_action(next_state_b)
                next_obs_b, reward_b, done_b, _ = env.step(action_b)
                next_state_b = make_state(next_obs_b, env.player_turn)

            # update the white policy with black's (negated) reward
            if not init:
                policy['white'].run(state_w, action_w, -reward_b, done_b,
                                    next_state_b)
            init = False

            if done_b:
                policy['black'].run(state_b, action_b, reward_b, done_b,
                                    next_state_b)
                break

            # white
            assert env.player_turn == 1
            state_w = next_state_b
            action_w = policy['white'].get_action(state_w)
            next_obs_w, reward_w, done_w, _ = env.step(action_w)
            next_state_w = make_state(next_obs_w, env.player_turn)
            while (not done_w) and env.player_turn == 1:
                policy['white'].run(state_w, action_w, reward_w, done_w,
                                    next_state_w)
                action_w = policy['white'].get_action(next_state_w)
                next_obs_w, reward_w, done_w, _ = env.step(action_w)
                next_state_w = make_state(next_obs_w, env.player_turn)

            # update the black policy with white's (negated) reward
            policy['black'].run(state_b, action_b, -reward_w, done_w,
                                next_state_w)

            if done_w:
                policy['white'].run(state_w, action_w, reward_w, done_w,
                                    next_state_w)
                break

            state_b = next_state_w

        if render:
            env.render()

        if done_w:
            reward = reward_w * protagonist
        elif done_b:
            reward = reward_b * -protagonist
        else:
            raise ValueError
        print('reward={}'.format(reward))

        if num_disk_as_reward:
            total_disks = board_size**2
            if protagonist == 1:
                white_cnts = (total_disks + reward) / 2
                black_cnts = total_disks - white_cnts
                if white_cnts > black_cnts:
                    win_cnts += 1
                elif white_cnts == black_cnts:
                    draw_cnts += 1
                else:
                    lose_cnts += 1
            else:
                black_cnts = (total_disks + reward) / 2
                white_cnts = total_disks - black_cnts
                if black_cnts > white_cnts:
                    win_cnts += 1
                elif white_cnts == black_cnts:
                    draw_cnts += 1
                else:
                    lose_cnts += 1
        else:
            if reward == 1:
                win_cnts += 1
            elif reward == 0:
                draw_cnts += 1
            else:
                lose_cnts += 1
        print('-' * 3)
        print('#Wins: {}, #Draws: {}, #Loses: {}'.format(
            win_cnts, draw_cnts, lose_cnts))

    env.close()
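

# The win/draw/loss bookkeeping above recovers per-colour disk counts from the
# signed final reward when `num_disk_as_reward` is set.  The helper below is a
# small, self-contained sketch of that arithmetic (the name
# `_disk_counts_from_reward` is ours, not part of the original code), assuming
# the final reward equals (protagonist disks - opponent disks) on a full board.
def _disk_counts_from_reward(reward, board_size=8):
    """Return (protagonist_disks, opponent_disks) implied by a signed reward."""
    total_disks = board_size ** 2
    protagonist_disks = (total_disks + reward) / 2
    opponent_disks = total_disks - protagonist_disks
    return protagonist_disks, opponent_disks


# Example: a reward of +16 on an 8x8 board corresponds to a 40-24 win.
assert _disk_counts_from_reward(16) == (40.0, 24.0)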
def test(
        protagonist,
        protagonist_agent_type,
        opponent_agent_type,
        board_size,
        num_rounds,
        protagonist_search_depth,
        opponent_search_depth,
        rand_seed,
        env_init_rand_steps,
        num_disk_as_reward=True,
        test_init_rand_steps=10,
        render=False,
        train_teacher=True,
        teacher_test_interval=2000,
        test_interval=10,
        num_test_games=200,
        save_interval=500,
        # load_path='data/selfplay/ent0_lr1e-5_35000.pth'):
        # load_path='/data/unagi0/omura/othello/selfplay/ent0_lr1e-5_35000.pth'):
        load_path='/data/unagi0/omura/othello/selfplay/ent0_lr1e-5_numstep64_45000.pth'):
    # load_path='/data/unagi0/omura/othello/teacher_student/testinterval10_ent0_lr5e-6_clip1e-1_numstep64_teacher_10000.pth'):
    args = get_args()
    args.algo = 'ppo'
    args.use_gae = True
    args.lr = 5e-6  # 2.5e-4
    args.clip_param = 0.1
    args.value_loss_coef = 0.5  # 0.5
    args.num_processes = 8
    args.num_steps = 64
    args.num_mini_batch = 4
    args.log_interval = 1
    args.use_linear_lr_decay = True
    args.entropy_coef = 0.0  # 0.01
    print(args)

    step_per_episode = 32
    # num_rounds_per_proc = num_rounds // args.num_processes
    num_updates = (num_rounds * step_per_episode) // args.num_steps

    # torch.manual_seed(args.seed)
    # torch.cuda.manual_seed_all(args.seed)
    #
    # if args.cuda and torch.cuda.is_available() and args.cuda_deterministic:
    #     torch.backends.cudnn.benchmark = False
    #     torch.backends.cudnn.deterministic = True
    #
    # log_dir = os.path.expanduser(args.log_dir)
    # eval_log_dir = log_dir + "_eval"
    # utils.cleanup_log_dir(log_dir)
    # utils.cleanup_log_dir(eval_log_dir)

    # torch.set_num_threads(1)
    # device = torch.device("cuda:0" if args.cuda else "cpu")
    device = torch.device("cpu")

    # agent_name = 'wo_ttrain_ent0_lr5e-6_clip1e-1_numstep64_3rd'
    # agent_name = 'testinterval1_ent0_lr5e-6_clip1e-1_numstep64_2nd'
    agent_name = 'testinterval10_ent0_lr5e-6_clip1e-1_numstep64_3rd'
    # agent_name = 'trained1_10k_wo_ttrain_ent0_lr5e-6_clip1e-1_numstep64'
    # agent_name = 'trained1_10k_testinterval10_ent0_lr5e-6_clip1e-1_numstep64'
    # agent_name = 'test'
    writer = SummaryWriter(
        log_dir="./log/ppo_teacher_vs_student/{}".format(agent_name))

    envs_list = []
    for i in range(args.num_processes):
        env = othello.SimpleOthelloEnv(board_size=board_size,
                                       seed=i,
                                       initial_rand_steps=env_init_rand_steps,
                                       num_disk_as_reward=num_disk_as_reward,
                                       render_in_step=render)
        env.rand_steps_holder = env_init_rand_steps
        env.test_rand_steps_holder = test_init_rand_steps
        envs_list.append(env)

    obs_space = spaces.Box(np.zeros((4, 8, 8)), np.ones((4, 8, 8)))
    action_space = spaces.Discrete(board_size**2)

    if load_path:
        actor_critic_teacher = torch.load(load_path)
    else:
        actor_critic_teacher = Policy(
            obs_space.shape, action_space,
            base_kwargs={'recurrent': args.recurrent_policy})
    actor_critic_student = Policy(
        obs_space.shape, action_space,
        base_kwargs={'recurrent': args.recurrent_policy})
    actor_critic_student.to(device)
    actor_critic_teacher.to(device)

    envs = PPOTeacherStudentEnvs(envs_list, othello_teacher_vs_student,
                                 actor_critic_teacher, actor_critic_student,
                                 device)

    if args.algo == 'a2c':
        agent_teacher = algo.A2C_ACKTR(actor_critic_teacher,
                                       args.value_loss_coef,
                                       args.entropy_coef,
                                       lr=args.lr, eps=args.eps,
                                       alpha=args.alpha,
                                       max_grad_norm=args.max_grad_norm)
        agent_student = algo.A2C_ACKTR(actor_critic_student,
                                       args.value_loss_coef,
                                       args.entropy_coef,
                                       lr=args.lr, eps=args.eps,
                                       alpha=args.alpha,
                                       max_grad_norm=args.max_grad_norm)
    elif args.algo == 'ppo':
        agent_teacher = algo.PPO(actor_critic_teacher,
                                 args.clip_param,
                                 args.ppo_epoch,
                                 args.num_mini_batch,
                                 args.value_loss_coef,
                                 args.entropy_coef,
                                 lr=args.lr, eps=args.eps,
                                 max_grad_norm=args.max_grad_norm)
        agent_student = algo.PPO(actor_critic_student,
                                 args.clip_param,
                                 args.ppo_epoch,
                                 args.num_mini_batch,
                                 args.value_loss_coef,
                                 args.entropy_coef,
                                 lr=args.lr, eps=args.eps,
                                 max_grad_norm=args.max_grad_norm)
    elif args.algo == 'acktr':
        agent_teacher = algo.A2C_ACKTR(actor_critic_teacher,
                                       args.value_loss_coef,
                                       args.entropy_coef, acktr=True)
        agent_student = algo.A2C_ACKTR(actor_critic_student,
                                       args.value_loss_coef,
                                       args.entropy_coef, acktr=True)

    rollouts_teacher = RolloutStorage(
        args.num_steps, args.num_processes, obs_space.shape, action_space,
        actor_critic_teacher.recurrent_hidden_state_size)
    rollouts_student = RolloutStorage(
        args.num_steps, args.num_processes, obs_space.shape, action_space,
        actor_critic_student.recurrent_hidden_state_size)

    # episode_rewards = deque(maxlen=10)

    update_t = 0
    update_s = 0
    win_avg = {'rand': 0, 'greedy': 0}
    last_win_avg = {'rand': 0, 'greedy': 0}

    obs_ts = [[0] * args.num_processes, [0] * args.num_processes]
    action_ts = [[0] * args.num_processes, [0] * args.num_processes]
    reward_ts = [[0] * args.num_processes, [0] * args.num_processes]
    done_ts = [[0] * args.num_processes, [0] * args.num_processes]
    infos_ts = [[0] * args.num_processes, [0] * args.num_processes]
    v_logprob_hidden_ts = [[0] * args.num_processes, [0] * args.num_processes]
    masks_ts = [[0] * args.num_processes, [0] * args.num_processes]
    bad_masks_ts = [[0] * args.num_processes, [0] * args.num_processes]
    choices_ts = [[0] * args.num_processes, [0] * args.num_processes]

    def save_to_buffer(who_, idx, obs_, action_, reward_, done_, infos_,
                       v_logprob_hidden_, masks_, bad_masks_, choices_):
        if who_ == 'teacher':
            ts = 0
        else:
            ts = 1
        obs_ts[ts][idx] = obs_[idx]
        action_ts[ts][idx] = action_[idx]
        reward_ts[ts][idx] = reward_[idx]
        done_ts[ts][idx] = done_[idx]
        infos_ts[ts][idx] = infos_[idx]
        v_logprob_hidden_ts[ts][idx] = v_logprob_hidden_[idx]
        masks_ts[ts][idx] = masks_[idx]
        bad_masks_ts[ts][idx] = bad_masks_[idx]
        choices_ts[ts][idx] = choices_[idx]

    student_buffer = {}
    teacher_buffer = {}
    for i in range(args.num_processes):
        student_buffer[i] = 0
        teacher_buffer[i] = 0

    teacher_step = 0
    student_step = 0
    for episode in range(num_rounds):
        print()
        print('Episode %s' % episode)
        teacher = random.choice([1, -1])
        envs.reset(teacher, win_avg, last_win_avg)
        over = False
        done_ts = [[0] * args.num_processes, [0] * args.num_processes]
        # teacher_step = 0
        # student_step = 0
        accum_reward_s = np.zeros(args.num_processes)
        accum_reward_t = np.zeros(args.num_processes)
        while not over:
            over = all(np.array(done_ts[0]) + np.array(done_ts[1]))

            # Observe reward and next obs
            # if not over:
            t_or_s, obs, action, reward, done, infos, v_logprob_hidden, masks, bad_masks = envs.step(
                rollouts_student.recurrent_hidden_states[student_step % args.num_steps],
                rollouts_teacher.recurrent_hidden_states[teacher_step % args.num_steps])
            # print('@', over, t_or_s, done, reward.squeeze())
            # print(action.squeeze())
            choices = [info['choices'] for info in infos]
            # for i in range(len(action)):
            #     assert done[i] or action[i][0] in choices[i], (action[i][0], choices[i])

            for i, who in enumerate(t_or_s):
                save_to_buffer(who, i, obs, action, reward, done, infos,
                               v_logprob_hidden, masks, bad_masks, choices)
                if who == 'teacher':
                    teacher_buffer[i] = 1
                else:
                    student_buffer[i] = 1
            # print(action_ts, choices_ts)

            if all(list(teacher_buffer.values())) or over:
                obs_t = torch.stack(obs_ts[0])
                action_t = torch.stack(action_ts[0])
                reward_t = torch.stack(reward_ts[0])
                # done_t = done_ts[0]
                # infos_t = infos_ts[0]
                v_logprob_hidden_t = torch.stack(v_logprob_hidden_ts[0])
                masks_t = torch.stack(masks_ts[0])
                bad_masks_t = torch.stack(bad_masks_ts[0])
                choices_t = copy.deepcopy(choices_ts[0])
                accum_reward_t = accum_reward_t + np.array(reward_t.squeeze())
                # print('t', accum_reward_t, np.array(reward_t.squeeze()))

                if teacher_step == 0:
                    rollouts_teacher.obs[0].copy_(obs_t)
                    rollouts_teacher.masks[0].copy_(masks_t)
                    rollouts_teacher.bad_masks[0].copy_(bad_masks_t)
                else:
                    rollouts_teacher.insert(obs_t, prev_hidden_t,
                                            prev_action_t, prev_logprob_t,
                                            prev_value_t, prev_reward_t,
                                            masks_t, bad_masks_t,
                                            prev_choices_t)

                prev_action_t = action_t
                prev_value_t = v_logprob_hidden_t[:, 0].unsqueeze(1)
                prev_logprob_t = v_logprob_hidden_t[:, 1].unsqueeze(1)
                prev_hidden_t = v_logprob_hidden_t[:, 2].unsqueeze(1)
                prev_reward_t = reward_t
                # prev_masks = masks
                # prev_bad_masks = bad_masks
                prev_choices_t = copy.deepcopy(choices_t)
                # over_t = all(done_t)
                teacher_step += 1
                for i in range(args.num_processes):
                    teacher_buffer[i] = 0

            if all(list(student_buffer.values())) or over:
                obs_s = torch.stack(obs_ts[1])
                action_s = torch.stack(action_ts[1])
                reward_s = torch.stack(reward_ts[1])
                # done_s = done_ts[1]
                # infos_s = infos_ts[1]
                v_logprob_hidden_s = torch.stack(v_logprob_hidden_ts[1])
                masks_s = torch.stack(masks_ts[1])
                bad_masks_s = torch.stack(bad_masks_ts[1])
                choices_s = copy.deepcopy(choices_ts[1])
                accum_reward_s = accum_reward_s + np.array(reward_s.squeeze())
                # print('s', accum_reward_s, np.array(reward_s.squeeze()))

                if student_step == 0:
                    rollouts_student.obs[0].copy_(obs_s)
                    rollouts_student.masks[0].copy_(masks_s)
                    rollouts_student.bad_masks[0].copy_(bad_masks_s)
                else:
                    rollouts_student.insert(obs_s, prev_hidden_s,
                                            prev_action_s, prev_logprob_s,
                                            prev_value_s, prev_reward_s,
                                            masks_s, bad_masks_s,
                                            prev_choices_s)

                prev_action_s = action_s
                prev_value_s = v_logprob_hidden_s[:, 0].unsqueeze(1)
                prev_logprob_s = v_logprob_hidden_s[:, 1].unsqueeze(1)
                prev_hidden_s = v_logprob_hidden_s[:, 2].unsqueeze(1)
                prev_reward_s = reward_s
                # prev_masks = masks
                # prev_bad_masks = bad_masks
                prev_choices_s = copy.deepcopy(choices_s)
                # over_s = all(done_s)
                student_step += 1
                for i in range(args.num_processes):
                    student_buffer[i] = 0

            if (teacher_step % args.num_steps == 0) and (teacher_step != 0):
                if train_teacher:
                    with torch.no_grad():
                        next_value_teacher = actor_critic_teacher.get_value(
                            rollouts_teacher.obs[-1],
                            rollouts_teacher.recurrent_hidden_states[-1],
                            rollouts_teacher.masks[-1]).detach()
                    rollouts_teacher.compute_returns(
                        next_value_teacher, args.use_gae, args.gamma,
                        args.gae_lambda, args.use_proper_time_limits)
                    value_loss_teacher, action_loss_teacher, dist_entropy_teacher = agent_teacher.update(
                        rollouts_teacher)
                    rollouts_teacher.after_update()
                    if args.use_linear_lr_decay:
                        utils.update_linear_schedule(
                            agent_teacher.optimizer, update_t, num_updates,
                            agent_teacher.optimizer.lr
                            if args.algo == "acktr" else args.lr)
                    update_t += 1
                # teacher_step = 0

            if (student_step % args.num_steps == 0) and (student_step != 0):
                with torch.no_grad():
                    next_value_student = actor_critic_student.get_value(
                        rollouts_student.obs[-1],
                        rollouts_student.recurrent_hidden_states[-1],
                        rollouts_student.masks[-1]).detach()
                rollouts_student.compute_returns(next_value_student,
                                                 args.use_gae, args.gamma,
                                                 args.gae_lambda,
                                                 args.use_proper_time_limits)
                value_loss_student, action_loss_student, dist_entropy_student = agent_student.update(
                    rollouts_student)
                rollouts_student.after_update()
                if args.use_linear_lr_decay:
                    utils.update_linear_schedule(
                        agent_student.optimizer, update_s, num_updates,
                        agent_student.optimizer.lr
                        if args.algo == "acktr" else args.lr)
                update_s += 1
                # student_step = 0

            if over:
                student_wins = 0
                print('reward')
                print(accum_reward_s)
                for r in accum_reward_s:
                    if r > 0:
                        student_wins += 1
                student_win_percent = student_wins / len(accum_reward_s)

        # over = all(done_ts[0]) and all(done_ts[1])
        # over = all(np.array(done_ts[0])+np.array(done_ts[1]))

        if episode % test_interval == 0:
            print('Test')
            games_rand, wins_rand = envs.test(
                'rand', num_test_games,
                rollouts_student.recurrent_hidden_states[0])
            writer.add_scalar("win avg({})".format('rand'),
                              wins_rand / games_rand, episode)
            print('### vs-random winning% {}/{}={}'.format(
                wins_rand, games_rand, wins_rand / games_rand))
            games_greedy, wins_greedy = envs.test(
                'greedy', num_test_games,
                rollouts_student.recurrent_hidden_states[0])
            writer.add_scalar("win avg({})".format('greedy'),
                              wins_greedy / games_greedy, episode)
            print('### vs-greedy winning% {}/{}={}'.format(
                wins_greedy, games_greedy, wins_greedy / games_greedy))
            last_win_avg = copy.deepcopy(win_avg)
            win_avg['rand'] = wins_rand / games_rand
            win_avg['greedy'] = wins_greedy / games_greedy

        if episode % teacher_test_interval == 0:
            print('Test teacher')
            games_rand, wins_rand = envs.test(
                'rand', num_test_games,
                rollouts_teacher.recurrent_hidden_states[0], teacher=True)
            writer.add_scalar("win avg teacher({})".format('rand'),
                              wins_rand / games_rand, episode)
            print('### vs-random winning% {}/{}={}'.format(
                wins_rand, games_rand, wins_rand / games_rand))
            games_greedy, wins_greedy = envs.test(
                'greedy', num_test_games,
                rollouts_teacher.recurrent_hidden_states[0], teacher=True)
            writer.add_scalar("win avg teacher({})".format('greedy'),
                              wins_greedy / games_greedy, episode)
            print('### vs-greedy winning% {}/{}={}'.format(
                wins_greedy, games_greedy, wins_greedy / games_greedy))
            last_win_avg = copy.deepcopy(win_avg)
            win_avg['rand'] = wins_rand / games_rand
            win_avg['greedy'] = wins_greedy / games_greedy

        if episode % save_interval == 0:
            if os.path.exists('/data/unagi0/omura'):
                t_save_path = '/data/unagi0/omura/othello/teacher_student/{}_teacher_{}.pth'.format(
                    agent_name, episode)
                s_save_path = '/data/unagi0/omura/othello/teacher_student/{}_student_{}.pth'.format(
                    agent_name, episode)
            else:
                t_save_path = 'data/selfplay/{}_teacher_{}.pth'.format(
                    agent_name, episode)
                s_save_path = 'data/selfplay/{}_student_{}.pth'.format(
                    agent_name, episode)
            torch.save(actor_critic_teacher, t_save_path)
            torch.save(actor_critic_student, s_save_path)

        if teacher_step > args.num_steps and student_step > args.num_steps:
            writer.add_scalar("value_loss_student", value_loss_student, episode)
            writer.add_scalar("action_loss_student", action_loss_student, episode)
            writer.add_scalar("dist_entropy_student", dist_entropy_student, episode)
            writer.add_scalar("student_win_percent", student_win_percent, episode)
            if train_teacher:
                writer.add_scalar("value_loss_teacher", value_loss_teacher, episode)
                writer.add_scalar("action_loss_teacher", action_loss_teacher, episode)
                writer.add_scalar("dist_entropy_teacher", dist_entropy_teacher, episode)
        # print(value_loss, action_loss, dist_entropy)

    envs.over()
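

# The loop above interleaves teacher and student moves across processes, so
# each transition is staged into a per-process slot (`save_to_buffer`) and only
# flushed into the corresponding RolloutStorage once every process has
# contributed one, or the episode is over.  Below is a minimal, dependency-free
# sketch of that staging pattern; the names (`StagedBuffer`, `flush`) are
# illustrative and not part of the code above.
class StagedBuffer(object):
    def __init__(self, num_processes):
        self.num_processes = num_processes
        self.slots = [None] * num_processes

    def put(self, proc_idx, transition):
        self.slots[proc_idx] = transition

    def ready(self):
        # Mirrors `all(list(teacher_buffer.values()))` above.
        return all(slot is not None for slot in self.slots)

    def flush(self):
        batch, self.slots = self.slots, [None] * self.num_processes
        return batch


# Usage sketch: stage one transition per process, then flush a full batch.
_buf = StagedBuffer(num_processes=2)
_buf.put(0, ('obs0', 'act0', 0.0))
_buf.put(1, ('obs1', 'act1', 1.0))
if _buf.ready():
    _batch = _buf.flush()  # one entry per process, in process order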
def test(
        protagonist,
        protagonist_agent_type,
        opponent_agent_type,
        board_size,
        num_rounds,
        protagonist_search_depth,
        opponent_search_depth,
        rand_seed,
        env_init_rand_steps,
        test_init_rand_steps=10,
        num_disk_as_reward=True,
        render=False,
        test_interval=500,
        num_test_games=200,
        save_interval=500,
        # load_path='data/selfplay/rainbow_selfplay_350000.pth'):
        load_path=''):
    args = get_args()
    args.algo = 'ppo'
    args.use_gae = True
    args.lr = 1e-5  # 2.5e-4
    args.clip_param = 0.1
    args.value_loss_coef = 0.5  # 0.5
    args.num_processes = 8
    args.num_steps = 64  # 128
    args.num_mini_batch = 4
    args.log_interval = 1
    args.use_linear_lr_decay = True
    args.entropy_coef = 0  # 0.01
    print(args)

    step_per_episode = 32
    # num_rounds_per_proc = num_rounds // args.num_processes
    num_updates = (num_rounds * step_per_episode) // args.num_steps

    # torch.manual_seed(args.seed)
    # torch.cuda.manual_seed_all(args.seed)
    #
    # if args.cuda and torch.cuda.is_available() and args.cuda_deterministic:
    #     torch.backends.cudnn.benchmark = False
    #     torch.backends.cudnn.deterministic = True
    #
    # log_dir = os.path.expanduser(args.log_dir)
    # eval_log_dir = log_dir + "_eval"
    # utils.cleanup_log_dir(log_dir)
    # utils.cleanup_log_dir(eval_log_dir)

    # torch.set_num_threads(1)
    # device = torch.device("cuda:0" if args.cuda else "cpu")
    device = torch.device("cpu")

    # agent_name = 'ppo_selfplay_8proc_th1e-10_ent1e-2'
    # agent_name = 'ent0_lr1e-5_clip2e-1'
    agent_name = 'ent0_lr1e-5_numstep64'
    # agent_name = 'test'
    writer = SummaryWriter(log_dir="./log/ppo_selfplay/{}".format(agent_name))

    envs_list = []
    for i in range(args.num_processes):
        env = othello.SimpleOthelloEnv(board_size=board_size,
                                       seed=i,
                                       initial_rand_steps=env_init_rand_steps,
                                       num_disk_as_reward=num_disk_as_reward,
                                       render_in_step=render)
        env.rand_steps_holder = env_init_rand_steps
        env.test_rand_steps_holder = test_init_rand_steps
        envs_list.append(env)

    obs_space = spaces.Box(np.zeros((4, 8, 8)), np.ones((4, 8, 8)))
    action_space = spaces.Discrete(board_size**2)

    if load_path:
        actor_critic = torch.load(load_path)
    else:
        actor_critic = Policy(obs_space.shape,
                              action_space,
                              base_kwargs={'recurrent': args.recurrent_policy})
    actor_critic.to(device)

    envs = PPOEnvs(envs_list, subproc_worker, actor_critic, device)

    if args.algo == 'a2c':
        agent = algo.A2C_ACKTR(actor_critic,
                               args.value_loss_coef,
                               args.entropy_coef,
                               lr=args.lr, eps=args.eps,
                               alpha=args.alpha,
                               max_grad_norm=args.max_grad_norm)
    elif args.algo == 'ppo':
        agent = algo.PPO(actor_critic,
                         args.clip_param,
                         args.ppo_epoch,
                         args.num_mini_batch,
                         args.value_loss_coef,
                         args.entropy_coef,
                         lr=args.lr, eps=args.eps,
                         max_grad_norm=args.max_grad_norm)
    elif args.algo == 'acktr':
        agent = algo.A2C_ACKTR(actor_critic,
                               args.value_loss_coef,
                               args.entropy_coef,
                               acktr=True)

    rollouts = RolloutStorage(args.num_steps, args.num_processes,
                              obs_space.shape, action_space,
                              actor_critic.recurrent_hidden_state_size)

    # episode_rewards = deque(maxlen=10)

    u = 0
    step = 0
    for episode in range(num_rounds):
        print()
        print('Episode %s' % episode)
        envs.reset()
        over = False
        while not over:
            if args.use_linear_lr_decay:
                # decrease learning rate linearly
                utils.update_linear_schedule(
                    agent.optimizer, u, num_updates,
                    agent.optimizer.lr if args.algo == "acktr" else args.lr)

            # for step in range(args.num_steps):
            # Observe reward and next obs
            # if not over:
            obs, action, reward, done, infos, v_logprob_hidden, masks, bad_masks = envs.step(
                rollouts.recurrent_hidden_states[step % args.num_steps])
            choices = [info['choices'] for info in infos]
            for i in range(len(action)):
                assert done[i] or action[i][0] in choices[i], (action[i][0],
                                                               choices[i])

            # for info in infos:
            #     if 'episode' in info.keys():
            #         episode_rewards.append(info['episode']['r'])

            if step == 0:
                rollouts.obs[0].copy_(obs)
                rollouts.masks[0].copy_(masks)
                rollouts.bad_masks[0].copy_(bad_masks)
            else:
                rollouts.insert(obs, prev_hidden, prev_action, prev_logprob,
                                prev_value, prev_reward, masks, bad_masks,
                                prev_choices)

            # prev_obs = obs
            prev_action = action
            prev_value = v_logprob_hidden[:, 0].unsqueeze(1)
            prev_logprob = v_logprob_hidden[:, 1].unsqueeze(1)
            prev_hidden = v_logprob_hidden[:, 2].unsqueeze(1)
            prev_reward = reward
            # prev_masks = masks
            # prev_bad_masks = bad_masks
            prev_choices = choices

            over = all(done)

            if (step % args.num_steps == 0) and (step != 0):
                u += 1
                with torch.no_grad():
                    next_value = actor_critic.get_value(
                        rollouts.obs[-1],
                        rollouts.recurrent_hidden_states[-1],
                        rollouts.masks[-1]).detach()
                rollouts.compute_returns(next_value, args.use_gae, args.gamma,
                                         args.gae_lambda,
                                         args.use_proper_time_limits)
                value_loss, action_loss, dist_entropy = agent.update(rollouts)
                rollouts.after_update()

            step += 1

        if episode % test_interval == 0:
            games, wins = envs.test('rand', num_test_games,
                                    rollouts.recurrent_hidden_states[0])
            writer.add_scalar("win%({})".format('rand'), wins / games, episode)
            print('### vs-random winning% {}/{}={}'.format(
                wins, games, wins / games))
            games, wins = envs.test('greedy', num_test_games,
                                    rollouts.recurrent_hidden_states[0])
            writer.add_scalar("win%({})".format('greedy'), wins / games, episode)
            print('### vs-greedy winning% {}/{}={}'.format(
                wins, games, wins / games))

        if episode % save_interval == 0:
            if os.path.exists('/data/unagi0/omura'):
                save_path = '/data/unagi0/omura/othello/selfplay/{}_{}.pth'.format(
                    agent_name, episode)
            else:
                save_path = 'data/selfplay/{}_{}.pth'.format(agent_name, episode)
            torch.save(actor_critic, save_path)

        if step > args.num_steps:
            writer.add_scalar("value_loss", value_loss, episode)
            writer.add_scalar("action_loss", action_loss, episode)
            writer.add_scalar("dist_entropy", dist_entropy, episode)
            print(value_loss, action_loss, dist_entropy)

    envs.over()
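

# The PPO update cadence above is driven by two numbers: an update fires every
# `args.num_steps` environment steps (`step % args.num_steps == 0`), and
# `num_updates`, used for the linear LR decay, is the total number of such
# updates expected over training.  A quick sanity check of that arithmetic with
# the values hard-coded above (step_per_episode=32, num_steps=64); the helper
# name is ours, purely illustrative.
def _expected_num_updates(num_rounds, step_per_episode=32, num_steps=64):
    return (num_rounds * step_per_episode) // num_steps


# 10,000 episodes of ~32 steps each yield 5,000 updates of 64 steps apiece.
assert _expected_num_updates(10000) == 5000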
def play(
        protagonist,
        protagonist_agent_type='greedy',
        opponent_agent_type='rand',
        board_size=8,
        num_rounds=300000,
        protagonist_search_depth=1,
        opponent_search_depth=1,
        rand_seed=0,
        env_init_rand_steps=0,
        test_init_rand_steps=10,
        num_disk_as_reward=True,
        render=False,
        test_interval=2500,
        num_test_games=200,
        save_interval=5000,
        # load_path='data/selfplay/rainbow_selfplay_350000.pth'):
        load_path=''):
    print('protagonist: {}'.format(protagonist_agent_type))
    print('opponent: {}'.format(opponent_agent_type))

    agent_name = 'rainbow_selfplay_2nd'
    protagonist_policy = create_policy(policy_type=protagonist_agent_type,
                                       board_size=board_size,
                                       seed=rand_seed,
                                       search_depth=protagonist_search_depth,
                                       agent_name=agent_name)
    opponent_policy1 = create_policy(policy_type='rand',
                                     board_size=board_size,
                                     seed=rand_seed,
                                     search_depth=opponent_search_depth)
    opponent_policy2 = create_policy(policy_type='greedy',
                                     board_size=board_size,
                                     seed=rand_seed,
                                     search_depth=opponent_search_depth)
    opponent_policies = [('rand', opponent_policy1),
                         ('greedy', opponent_policy2)]

    # disable .run
    # def nop(*args):
    #     pass
    # opponent_policy.run = nop
    # if not hasattr(protagonist_policy, 'run'):
    #     protagonist_policy.run = nop

    # if opponent_agent_type == 'human':
    #     render_in_step = True
    # else:
    #     render_in_step = False

    if load_path:
        print('Load {} ...'.format(load_path))
        start_episode, loss = load(protagonist_policy, load_path)
    else:
        start_episode = 0

    env = othello.SimpleOthelloEnv(board_size=board_size,
                                   seed=rand_seed,
                                   initial_rand_steps=env_init_rand_steps,
                                   num_disk_as_reward=num_disk_as_reward,
                                   render_in_step=render)

    win_cnts = draw_cnts = lose_cnts = 0
    for i in range(start_episode, num_rounds):
        switch = np.random.randint(2)
        if switch:
            protagonist = protagonist * -1

        # self-play: the protagonist policy controls both colors
        policy = {}
        if protagonist == -1:
            pcolor = 'black'
            policy['black'] = protagonist_policy
            policy['white'] = protagonist_policy
        else:
            pcolor = 'white'
            policy['black'] = protagonist_policy
            policy['white'] = protagonist_policy

        print('Episode {}'.format(i + 1))
        print('Protagonist is {}'.format(pcolor))

        obs_b = env.reset()
        state_b = make_state(obs_b, env)
        protagonist_policy.reset(env)
        # opponent_policy.reset(env)
        if render:
            env.render()

        done_b = done_w = False
        init = True
        while not (done_b or done_w):
            # black
            assert env.player_turn == -1
            # action_b = policy['black'].get_action(state_b)
            action_b = action('black', pcolor, state_b, policy)
            next_obs_b, reward_b, done_b, _ = env.step(action_b)
            next_state_b = make_state(next_obs_b, env)
            while (not done_b) and env.player_turn == -1:
                if pcolor == 'black':
                    policy['black'].run(state_b, action_b, reward_b, done_b,
                                        next_state_b)
                # action_b = policy['black'].get_action(next_state_b)
                action_b = action('black', pcolor, next_state_b, policy)
                next_obs_b, reward_b, done_b, _ = env.step(action_b)
                next_state_b = make_state(next_obs_b, env)

            # update the white policy with black's (negated) reward
            if not init:
                if pcolor == 'white':
                    policy['white'].run(state_w, action_w, -reward_b, done_b,
                                        next_state_b)
            init = False

            if done_b:
                if pcolor == 'black':
                    policy['black'].run(state_b, action_b, reward_b, done_b,
                                        next_state_b)
                break

            # white
            assert env.player_turn == 1
            state_w = next_state_b
            # action_w = policy['white'].get_action(state_w)
            action_w = action('white', pcolor, state_w, policy)
            next_obs_w, reward_w, done_w, _ = env.step(action_w)
            next_state_w = make_state(next_obs_w, env)
            while (not done_w) and env.player_turn == 1:
                if pcolor == 'white':
                    policy['white'].run(state_w, action_w, reward_w, done_w,
                                        next_state_w)
                # action_w = policy['white'].get_action(next_state_w)
                action_w = action('white', pcolor, next_state_w, policy)
                next_obs_w, reward_w, done_w, _ = env.step(action_w)
                next_state_w = make_state(next_obs_w, env)

            # update the black policy with white's (negated) reward
            if pcolor == 'black':
                policy['black'].run(state_b, action_b, -reward_w, done_w,
                                    next_state_w)

            if done_w:
                if pcolor == 'white':
                    policy['white'].run(state_w, action_w, reward_w, done_w,
                                        next_state_w)
                break

            state_b = next_state_w

        if render:
            env.render()

        if done_w:
            reward = reward_w * protagonist
        elif done_b:
            reward = reward_b * -protagonist
        else:
            raise ValueError
        print('reward={}'.format(reward))

        if num_disk_as_reward:
            total_disks = board_size**2
            if protagonist == 1:
                white_cnts = (total_disks + reward) / 2
                black_cnts = total_disks - white_cnts
                if white_cnts > black_cnts:
                    win_cnts += 1
                elif white_cnts == black_cnts:
                    draw_cnts += 1
                else:
                    lose_cnts += 1
            else:
                black_cnts = (total_disks + reward) / 2
                white_cnts = total_disks - black_cnts
                if black_cnts > white_cnts:
                    win_cnts += 1
                elif white_cnts == black_cnts:
                    draw_cnts += 1
                else:
                    lose_cnts += 1
        else:
            if reward == 1:
                win_cnts += 1
            elif reward == 0:
                draw_cnts += 1
            else:
                lose_cnts += 1
        print('-' * 3)
        print('#Wins: {}, #Draws: {}, #Loses: {}'.format(
            win_cnts, draw_cnts, lose_cnts))

        # calc student's winning %
        if i % test_interval == 0:
            env.initial_rand_steps = test_init_rand_steps
            for name, opponent_policy in opponent_policies:
                wins = 0
                protagonist = -1
                for j in range(num_test_games):
                    switch = np.random.randint(2)
                    if switch:
                        protagonist = protagonist * -1

                    policy = {}
                    if protagonist == -1:
                        pcolor = 'BLACK'
                        policy['black'] = protagonist_policy
                        policy['white'] = opponent_policy
                    else:
                        pcolor = 'WHITE'
                        policy['black'] = opponent_policy
                        policy['white'] = protagonist_policy

                    obs_b = env.reset()
                    state_b = make_state(obs_b, env)
                    protagonist_policy.reset(env)
                    opponent_policy.reset(env)
                    if render:
                        env.render()

                    done_b = done_w = False
                    while not (done_b or done_w):
                        # black
                        assert env.player_turn == -1
                        action_b = policy['black'].get_test_action(state_b)
                        next_obs_b, reward_b, done_b, _ = env.step(action_b)
                        next_state_b = make_state(next_obs_b, env)
                        while (not done_b) and env.player_turn == -1:
                            # policy['black'].run(state_b, action_b, reward_b, done_b, next_state_b)
                            action_b = policy['black'].get_test_action(
                                next_state_b)
                            next_obs_b, reward_b, done_b, _ = env.step(action_b)
                            next_state_b = make_state(next_obs_b, env)
                        if done_b:
                            break

                        # white
                        assert env.player_turn == 1
                        state_w = next_state_b
                        action_w = policy['white'].get_test_action(state_w)
                        next_obs_w, reward_w, done_w, _ = env.step(action_w)
                        next_state_w = make_state(next_obs_w, env)
                        while (not done_w) and env.player_turn == 1:
                            # policy['white'].run(state_w, action_w, reward_w, done_w, next_state_w)
                            action_w = policy['white'].get_test_action(
                                next_state_w)
                            next_obs_w, reward_w, done_w, _ = env.step(action_w)
                            next_state_w = make_state(next_obs_w, env)
                        if done_w:
                            break

                        state_b = next_state_w

                    if done_w:
                        reward = reward_w * protagonist
                    elif done_b:
                        reward = reward_b * -protagonist
                    else:
                        raise ValueError
                    if reward > 0:
                        wins += 1

                # last_win_per = win_per
                win_per = wins / num_test_games
                print()
                print('win % ({}):'.format(name), win_per)
                print()
                protagonist_policy.writer.add_scalar("win%({})".format(name),
                                                     win_per, i)
            env.initial_rand_steps = env_init_rand_steps

        if i % save_interval == 0:
            if os.path.exists('/data/unagi0/omura'):
                save_path = '/data/unagi0/omura/othello/selfplay/{}_{}.pth'.format(
                    agent_name, i)
            else:
                save_path = 'data/selfplay/{}_{}.pth'.format(agent_name, i)
            save(i, protagonist_policy, 0, save_path)

    env.close()
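

# In Othello a player with no legal move passes, so after env.step() the turn
# may stay with the same colour.  The inner `while ... env.player_turn == -1`
# loops above exist to keep acting for that colour until the turn really
# changes (or the game ends).  Below is a minimal sketch of that control flow
# with a stub environment; `_FakeEnv` and its scripted turn sequence are ours,
# purely for illustration.
class _FakeEnv(object):
    def __init__(self, turns):
        self._turns = list(turns)  # player to move after each step
        self.player_turn = -1      # black moves first

    def step(self, action):
        self.player_turn = self._turns.pop(0)
        done = len(self._turns) == 0
        return None, 0.0, done, {}


_env = _FakeEnv(turns=[-1, -1, 1])  # white passes twice, then gets to move
_moves_by_black = 0
_obs, _reward, _done, _ = _env.step('some-move')
_moves_by_black += 1
while (not _done) and _env.player_turn == -1:
    _obs, _reward, _done, _ = _env.step('another-move')
    _moves_by_black += 1
assert _moves_by_black == 3 and _env.player_turn == 1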
def play(
        teacher,
        teacher_agent_type='rainbow',
        student_agent_type='rainbow',
        opponent_agent_type='',
        board_size=8,
        num_rounds=400000,
        teacher_search_depth=1,
        student_search_depth=1,
        opponent_search_depth=1,
        rand_seed=0,
        env_init_rand_steps=0,
        test_init_rand_steps=10,
        num_disk_as_reward=True,
        render=False,
        train_teacher=False,
        test_interval=2500,
        num_test_games=200,
        teacher_train_steps=5000,
        save_interval=5000,
        # load_path='',
        # load_path='data/selfplay/rainbow_selfplay_350000.pth',
        # load_path='/data/unagi0/omura/othello/selfplay/rainbow_selfplay_350000.pth',
        # load_path='/data/unagi0/omura/othello/selfplay/rainbow_selfplay_2nd_65000.pth',
        load_path='/data/unagi0/omura/othello/teacher_student/rainbow_gre_rand_teacher_train_10interval_mp_59999.pth',
        num_process=1):
    print('teacher: {}'.format(teacher_agent_type))
    print('student: {}'.format(student_agent_type))
    print('opponent: {}'.format(opponent_agent_type))

    agent_name_teacher = 'rainbow_gre_rand_teacher_notrain_mp_load_teacher60k'
    agent_name_student = 'rainbow_gre_rand_student_notrain_mp_load_teacher60k'
    # agent_name_teacher = 'test'
    # agent_name_student = 'test'
    # load_path = ''

    teacher_policy = create_policy(policy_type=teacher_agent_type,
                                   board_size=board_size,
                                   seed=rand_seed,
                                   search_depth=teacher_search_depth,
                                   agent_name=agent_name_teacher)
    student_policy = create_policy(policy_type=student_agent_type,
                                   board_size=board_size,
                                   seed=rand_seed,
                                   search_depth=student_search_depth,
                                   agent_name=agent_name_student)
    opponent_policy1 = create_policy(policy_type='rand',
                                     board_size=board_size,
                                     seed=rand_seed,
                                     search_depth=opponent_search_depth)
    opponent_policy2 = create_policy(policy_type='greedy',
                                     board_size=board_size,
                                     seed=rand_seed,
                                     search_depth=opponent_search_depth)
    opponent_policies = [('rand', opponent_policy1),
                         ('greedy', opponent_policy2)]
    # opponent_policies = [('greedy', opponent_policy1)]

    if not train_teacher:
        def noop(*args):
            pass

        teacher_policy.run = noop
    # if not hasattr(protagonist_policy, 'run'):
    #     protagonist_policy.run = nop

    # if opponent_agent_type == 'human':
    #     render_in_step = True
    # else:
    #     render_in_step = False

    if load_path:
        print('Load {} ...'.format(load_path))
        start_episode, loss = load(teacher_policy, load_path)
    else:
        start_episode = 0

    env = othello.SimpleOthelloEnv(board_size=board_size,
                                   seed=rand_seed,
                                   initial_rand_steps=env_init_rand_steps,
                                   num_disk_as_reward=num_disk_as_reward,
                                   render_in_step=render)
    #
    # env_test = othello.OthelloEnv(
    #     board_size=board_size,
    #     seed=rand_seed,
    #     initial_rand_steps=env_init_rand_steps,
    #     num_disk_as_reward=num_disk_as_reward,
    #     render_in_step=render)

    win_cnts = draw_cnts = lose_cnts = 0
    win_per = {'rand': 0, 'greedy': 0}
    last_win_per = {'rand': 0, 'greedy': 0}
    teacher_queue = queue.Queue()
    # for i in range(start_episode, num_rounds):
    for i in range(num_rounds):
        switch = np.random.randint(2)
        if switch:
            teacher = teacher * -1

        policy = {}
        if teacher == -1:
            tcolor = 'black'
            policy['black'] = teacher_policy
            policy['white'] = student_policy
        else:
            tcolor = 'white'
            policy['black'] = student_policy
            policy['white'] = teacher_policy

        print('Episode {}'.format(i + 1))
        print('Teacher is {}'.format(tcolor))

        def run(color, state, action, reward, done, next_state):
            if color == tcolor:
                if done:
                    # teacher reward: change in the student's test win rates
                    teacher_reward = 0
                    for k in win_per.keys():
                        teacher_reward += win_per[k] - last_win_per[k]
                else:
                    teacher_reward = 0
                if student_policy.is_learning():
                    # print('### learning')
                    teacher_queue.put(
                        (state, action, teacher_reward, done, next_state))
            else:
                policy[color].run(state, action, reward, done, next_state)

        obs_b = env.reset()
        state_b = make_state(obs_b, env)
        teacher_policy.reset(env)
        student_policy.reset(env)
        if render:
            env.render()

        done_b = done_w = False
        init = True
        # student_policy.win_queue = mp.Queue(num_process)
        while not (done_b or done_w):
            # black
            assert env.player_turn == -1
            action_b = policy['black'].get_action(state_b)
            next_obs_b, reward_b, done_b, _ = env.step(action_b)
            next_state_b = make_state(next_obs_b, env)
            while (not done_b) and env.player_turn == -1:
                # policy['black'].run(state_b, action_b, reward_b, done_b, next_state_b)
                run('black', state_b, action_b, reward_b, done_b, next_state_b)
                action_b = policy['black'].get_action(next_state_b)
                next_obs_b, reward_b, done_b, _ = env.step(action_b)
                next_state_b = make_state(next_obs_b, env)

            # update the white policy with black's (negated) reward
            if not init:
                # policy['white'].run(state_w, action_w, -reward_b, done_b, next_state_b)
                run('white', state_w, action_w, -reward_b, done_b, next_state_b)
            init = False

            if done_b:
                # policy['black'].run(state_b, action_b, reward_b, done_b, next_state_b)
                run('black', state_b, action_b, reward_b, done_b, next_state_b)
                break

            # white
            assert env.player_turn == 1
            state_w = next_state_b
            action_w = policy['white'].get_action(state_w)
            next_obs_w, reward_w, done_w, _ = env.step(action_w)
            next_state_w = make_state(next_obs_w, env)
            while (not done_w) and env.player_turn == 1:
                # policy['white'].run(state_w, action_w, reward_w, done_w, next_state_w)
                run('white', state_w, action_w, reward_w, done_w, next_state_w)
                action_w = policy['white'].get_action(next_state_w)
                next_obs_w, reward_w, done_w, _ = env.step(action_w)
                next_state_w = make_state(next_obs_w, env)

            # update the black policy with white's (negated) reward
            # policy['black'].run(state_b, action_b, -reward_w, done_w, next_state_w)
            run('black', state_b, action_b, -reward_w, done_w, next_state_w)

            if done_w:
                # policy['white'].run(state_w, action_w, reward_w, done_w, next_state_w)
                run('white', state_w, action_w, reward_w, done_w, next_state_w)
                break

            state_b = next_state_w

        if render:
            env.render()

        if done_w:
            reward = reward_w * teacher
        elif done_b:
            reward = reward_b * -teacher
        else:
            raise ValueError
        print('reward={}'.format(reward))

        if num_disk_as_reward:
            total_disks = board_size**2
            if teacher == 1:
                white_cnts = (total_disks + reward) / 2
                black_cnts = total_disks - white_cnts
                if white_cnts > black_cnts:
                    win_cnts += 1
                elif white_cnts == black_cnts:
                    draw_cnts += 1
                else:
                    lose_cnts += 1
            else:
                black_cnts = (total_disks + reward) / 2
                white_cnts = total_disks - black_cnts
                if black_cnts > white_cnts:
                    win_cnts += 1
                elif white_cnts == black_cnts:
                    draw_cnts += 1
                else:
                    lose_cnts += 1
        else:
            if reward == 1:
                win_cnts += 1
            elif reward == 0:
                draw_cnts += 1
            else:
                lose_cnts += 1
        print('-' * 3)
        print('#Wins: {}, #Draws: {}, #Loses: {}'.format(
            win_cnts, draw_cnts, lose_cnts))

        if teacher_queue.qsize() >= teacher_train_steps:
            while not teacher_queue.empty():
                trans = teacher_queue.get()
                teacher_policy.run(*trans)

        # calc student's winning %
        args_student = (student_agent_type, board_size, rand_seed,
                        student_search_depth, 'calc_win_rate')
        state_dict = student_policy.network_state_dict()
        if i % test_interval == 0:
            env.initial_rand_steps = test_init_rand_steps
            for name, opponent_policy in opponent_policies:
                win_queue = mp.Queue(num_process)
                p_games = num_test_games // num_process
                total_games = p_games * num_process
                ps = []
                student_policies = []
                # for j in range(num_process):
                #     student_policies.append(copy.deepcopy(student_policy))
                for j in range(num_process):
                    ps.append(
                        mp.Process(target=calc_win,
                                   args=(env, p_games, args_student,
                                         state_dict, opponent_policy,
                                         win_queue)))
                for p in ps:
                    p.start()
                    # time.sleep(0.5)
                for p in ps:
                    p.join()
                # assert win_queue.qsize() == num_process

                total_wins = 0
                for _ in range(num_process):
                    total_wins += win_queue.get()

                last_win_per[name] = win_per[name]
                win_per[name] = total_wins / total_games
                student_policy.writer.add_scalar("win%({})".format(name),
                                                 win_per[name], i)
            print()
            print('last win%:', last_win_per)
            print('win%:', win_per)
            print()
            env.initial_rand_steps = env_init_rand_steps

        if (i + 1) % save_interval == 0:
            teacher_path = '/data/unagi0/omura/othello/teacher_student/{}_{}.pth'.format(
                agent_name_teacher, i + 1)
            student_path = '/data/unagi0/omura/othello/teacher_student/{}_{}.pth'.format(
                agent_name_student, i + 1)
            # teacher_path = 'data/teacher_student/{}_{}.pth'.format(agent_name_teacher, i)
            # student_path = 'data/teacher_student/{}_{}.pth'.format(agent_name_student, i)
            save(i, teacher_policy, 0, teacher_path)
            save(i, student_policy, 0, student_path)

    env.close()
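

# In `run()` above the teacher's terminal reward is the improvement in the
# student's measured win rates since the previous evaluation, summed over the
# test opponents.  A tiny self-contained restatement of that computation
# (the helper name is ours, purely illustrative):
def _teacher_terminal_reward(win_per, last_win_per):
    return sum(win_per[k] - last_win_per[k] for k in win_per)


# Example: the student improved by 5 points vs. random and 10 vs. greedy.
assert abs(_teacher_terminal_reward({'rand': 0.80, 'greedy': 0.55},
                                    {'rand': 0.75, 'greedy': 0.45}) - 0.15) < 1e-9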