Example 1
def play(protagonist,
         protagonist_agent_type='greedy',
         opponent_agent_type='rand',
         board_size=8,
         num_rounds=100,
         protagonist_search_depth=1,
         opponent_search_depth=1,
         rand_seed=0,
         env_init_rand_steps=0,
         num_disk_as_reward=False,
         render=True):
    print('protagonist: {}'.format(protagonist_agent_type))
    print('opponent: {}'.format(opponent_agent_type))

    protagonist_policy = create_policy(policy_type=protagonist_agent_type,
                                       board_size=board_size,
                                       seed=rand_seed,
                                       search_depth=protagonist_search_depth)
    opponent_policy = create_policy(policy_type=opponent_agent_type,
                                    board_size=board_size,
                                    seed=rand_seed,
                                    search_depth=opponent_search_depth)

    # disable .run
    def nop(*args):
        pass

    opponent_policy.run = nop
    if not hasattr(protagonist_policy, 'run'):
        protagonist_policy.run = nop

    # if opponent_agent_type == 'human':
    #     render_in_step = True
    # else:
    #     render_in_step = False

    env = othello.SimpleOthelloEnv(board_size=board_size,
                                   seed=rand_seed,
                                   initial_rand_steps=env_init_rand_steps,
                                   num_disk_as_reward=num_disk_as_reward,
                                   render_in_step=render)

    win_cnts = draw_cnts = lose_cnts = 0
    for i in range(num_rounds):
        switch = np.random.randint(2)
        if switch:
            protagonist = protagonist * -1

        policy = {}
        if protagonist == -1:
            pcolor = 'BLACK'
            policy['black'] = protagonist_policy
            policy['white'] = opponent_policy
        else:
            pcolor = 'WHITE'
            policy['black'] = opponent_policy
            policy['white'] = protagonist_policy

        print('Episode {}'.format(i + 1))
        print('Protagonist is {}'.format(pcolor))

        obs_b = env.reset()
        state_b = make_state(obs_b, env.player_turn)
        protagonist_policy.reset(env)
        opponent_policy.reset(env)
        if render:
            env.render()
        done_b = done_w = False
        init = True
        while not (done_b or done_w):
            # black
            assert env.player_turn == -1
            action_b = policy['black'].get_action(state_b)
            next_obs_b, reward_b, done_b, _ = env.step(action_b)
            next_state_b = make_state(next_obs_b, env.player_turn)
            while (not done_b) and env.player_turn == -1:
                policy['black'].run(state_b, action_b, reward_b, done_b,
                                    next_state_b)
                action_b = policy['black'].get_action(next_state_b)
                next_obs_b, reward_b, done_b, _ = env.step(action_b)
                next_state_b = make_state(next_obs_b, env.player_turn)

            # learning white policy (credit white's previous move with -reward_b)
            if not init:
                policy['white'].run(state_w, action_w, -reward_b, done_b,
                                    next_state_b)
            init = False
            if done_b:
                policy['black'].run(state_b, action_b, reward_b, done_b,
                                    next_state_b)
                break

            # white
            assert env.player_turn == 1
            state_w = next_state_b
            action_w = policy['white'].get_action(state_w)
            next_obs_w, reward_w, done_w, _ = env.step(action_w)
            next_state_w = make_state(next_obs_w, env.player_turn)
            while (not done_w) and env.player_turn == 1:
                policy['white'].run(state_w, action_w, reward_w, done_w,
                                    next_state_w)
                action_w = policy['white'].get_action(next_state_w)
                next_obs_w, reward_w, done_w, _ = env.step(action_w)
                next_state_w = make_state(next_obs_w, env.player_turn)

            # learning black policy
            policy['black'].run(state_b, action_b, -reward_w, done_w,
                                next_state_w)
            if done_w:
                policy['white'].run(state_w, action_w, reward_w, done_w,
                                    next_state_w)
                break

            state_b = next_state_w

            if render:
                env.render()

        if done_w:
            reward = reward_w * protagonist
        elif done_b:
            reward = reward_b * -protagonist
        else:
            raise ValueError

        print('reward={}'.format(reward))
        if num_disk_as_reward:
            total_disks = board_size**2
            if protagonist == 1:
                white_cnts = (total_disks + reward) / 2
                black_cnts = total_disks - white_cnts

                if white_cnts > black_cnts:
                    win_cnts += 1
                elif white_cnts == black_cnts:
                    draw_cnts += 1
                else:
                    lose_cnts += 1

            else:
                black_cnts = (total_disks + reward) / 2
                white_cnts = total_disks - black_cnts

                if black_cnts > white_cnts:
                    win_cnts += 1
                elif white_cnts == black_cnts:
                    draw_cnts += 1
                else:
                    lose_cnts += 1

        else:
            if reward == 1:
                win_cnts += 1
            elif reward == 0:
                draw_cnts += 1
            else:
                lose_cnts += 1
        print('-' * 3)
        print('#Wins: {}, #Draws: {}, #Losses: {}'.format(
            win_cnts, draw_cnts, lose_cnts))
    env.close()
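
A minimal invocation sketch for the play routine above (not part of the original listing): the entry-point guard and every argument value are illustrative assumptions, while create_policy, make_state, np and othello are expected to come from the surrounding module.

# Hypothetical entry point; values are placeholders, only the keyword names
# come from the signature of play() above.
if __name__ == '__main__':
    play(protagonist=-1,                  # start as the black player
         protagonist_agent_type='greedy',
         opponent_agent_type='rand',
         board_size=8,
         num_rounds=10,
         rand_seed=0,
         render=False)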
Example 2
def test(
    protagonist,
    protagonist_agent_type,
    opponent_agent_type,
    board_size,
    num_rounds,
    protagonist_search_depth,
    opponent_search_depth,
    rand_seed,
    env_init_rand_steps,
    num_disk_as_reward=True,
    test_init_rand_steps=10,
    render=False,
    train_teacher=True,
    teacher_test_interval=2000,
    test_interval=10,
    num_test_games=200,
    save_interval=500,
    # load_path='data/selfplay/ent0_lr1e-5_35000.pth'):
    # load_path='/data/unagi0/omura/othello/selfplay/ent0_lr1e-5_35000.pth'):
    load_path='/data/unagi0/omura/othello/selfplay/ent0_lr1e-5_numstep64_45000.pth'
):
    # load_path = '/data/unagi0/omura/othello/teacher_student/testinterval10_ent0_lr5e-6_clip1e-1_numstep64_teacher_10000.pth'):

    args = get_args()
    args.algo = 'ppo'
    args.use_gae = True
    args.lr = 5e-6  #2.5e-4
    args.clip_param = 0.1
    args.value_loss_coef = 0.5  #0.5
    args.num_processes = 8
    args.num_steps = 64
    args.num_mini_batch = 4
    args.log_interval = 1
    args.use_linear_lr_decay = True
    args.entropy_coef = 0.0  # 0.01
    print(args)

    step_per_episode = 32
    # num_rounds_per_proc = num_rounds // args.num_processes
    num_updates = (num_rounds * step_per_episode) // args.num_steps

    # torch.manual_seed(args.seed)
    # torch.cuda.manual_seed_all(args.seed)
    #
    # if args.cuda and torch.cuda.is_available() and args.cuda_deterministic:
    #     torch.backends.cudnn.benchmark = False
    #     torch.backends.cudnn.deterministic = True
    #
    # log_dir = os.path.expanduser(args.log_dir)
    # eval_log_dir = log_dir + "_eval"
    # utils.cleanup_log_dir(log_dir)
    # utils.cleanup_log_dir(eval_log_dir)

    # torch.set_num_threads(1)
    # device = torch.device("cuda:0" if args.cuda else "cpu")
    device = torch.device("cpu")

    # agent_name = 'wo_ttrain_ent0_lr5e-6_clip1e-1_numstep64_3rd'
    # agent_name = 'testinterval1_ent0_lr5e-6_clip1e-1_numstep64_2nd'
    agent_name = 'testinterval10_ent0_lr5e-6_clip1e-1_numstep64_3rd'
    # agent_name = 'trained1_10k_wo_ttrain_ent0_lr5e-6_clip1e-1_numstep64'
    # agent_name = 'trained1_10k_testinterval10_ent0_lr5e-6_clip1e-1_numstep64'
    # agent_name = 'test'
    writer = SummaryWriter(
        log_dir="./log/ppo_teacher_vs_student/{}".format(agent_name))

    envs_list = []
    for i in range(args.num_processes):
        env = othello.SimpleOthelloEnv(board_size=board_size,
                                       seed=i,
                                       initial_rand_steps=env_init_rand_steps,
                                       num_disk_as_reward=num_disk_as_reward,
                                       render_in_step=render)
        env.rand_steps_holder = env_init_rand_steps
        env.test_rand_steps_holder = test_init_rand_steps
        envs_list.append(env)

    obs_space = spaces.Box(np.zeros((4, 8, 8)), np.ones((4, 8, 8)))
    action_space = spaces.Discrete(board_size**2)

    if load_path:
        actor_critic_teacher = torch.load(load_path)
    else:
        actor_critic_teacher = Policy(
            obs_space.shape,
            action_space,
            base_kwargs={'recurrent': args.recurrent_policy})
    actor_critic_student = Policy(
        obs_space.shape,
        action_space,
        base_kwargs={'recurrent': args.recurrent_policy})
    actor_critic_student.to(device)
    actor_critic_teacher.to(device)

    envs = PPOTeacherStudentEnvs(envs_list, othello_teacher_vs_student,
                                 actor_critic_teacher, actor_critic_student,
                                 device)
    if args.algo == 'a2c':
        agent_teacher = algo.A2C_ACKTR(actor_critic_teacher,
                                       args.value_loss_coef,
                                       args.entropy_coef,
                                       lr=args.lr,
                                       eps=args.eps,
                                       alpha=args.alpha,
                                       max_grad_norm=args.max_grad_norm)
        agent_student = algo.A2C_ACKTR(actor_critic_student,
                                       args.value_loss_coef,
                                       args.entropy_coef,
                                       lr=args.lr,
                                       eps=args.eps,
                                       alpha=args.alpha,
                                       max_grad_norm=args.max_grad_norm)
    elif args.algo == 'ppo':
        agent_teacher = algo.PPO(actor_critic_teacher,
                                 args.clip_param,
                                 args.ppo_epoch,
                                 args.num_mini_batch,
                                 args.value_loss_coef,
                                 args.entropy_coef,
                                 lr=args.lr,
                                 eps=args.eps,
                                 max_grad_norm=args.max_grad_norm)
        agent_student = algo.PPO(actor_critic_student,
                                 args.clip_param,
                                 args.ppo_epoch,
                                 args.num_mini_batch,
                                 args.value_loss_coef,
                                 args.entropy_coef,
                                 lr=args.lr,
                                 eps=args.eps,
                                 max_grad_norm=args.max_grad_norm)
    elif args.algo == 'acktr':
        agent_teacher = algo.A2C_ACKTR(actor_critic_teacher,
                                       args.value_loss_coef,
                                       args.entropy_coef,
                                       acktr=True)
        agent_student = algo.A2C_ACKTR(actor_critic_student,
                                       args.value_loss_coef,
                                       args.entropy_coef,
                                       acktr=True)

    rollouts_teacher = RolloutStorage(
        args.num_steps, args.num_processes, obs_space.shape, action_space,
        actor_critic_teacher.recurrent_hidden_state_size)
    rollouts_student = RolloutStorage(
        args.num_steps, args.num_processes, obs_space.shape, action_space,
        actor_critic_student.recurrent_hidden_state_size)

    # episode_rewards = deque(maxlen=10)
    update_t = 0
    update_s = 0
    win_avg = {'rand': 0, 'greedy': 0}
    last_win_avg = {'rand': 0, 'greedy': 0}

    obs_ts = [[0] * args.num_processes, [0] * args.num_processes]
    action_ts = [[0] * args.num_processes, [0] * args.num_processes]
    reward_ts = [[0] * args.num_processes, [0] * args.num_processes]
    done_ts = [[0] * args.num_processes, [0] * args.num_processes]
    infos_ts = [[0] * args.num_processes, [0] * args.num_processes]
    v_logprob_hidden_ts = [[0] * args.num_processes, [0] * args.num_processes]
    masks_ts = [[0] * args.num_processes, [0] * args.num_processes]
    bad_masks_ts = [[0] * args.num_processes, [0] * args.num_processes]
    choices_ts = [[0] * args.num_processes, [0] * args.num_processes]

    def save_to_buffer(who_, idx, obs_, action_, reward_, done_, infos_,
                       v_logprob_hidden_, masks_, bad_masks_, choices_):
        if who_ == 'teacher':
            ts = 0
        else:
            ts = 1
        obs_ts[ts][idx] = obs_[idx]
        action_ts[ts][idx] = action_[idx]
        reward_ts[ts][idx] = reward_[idx]
        done_ts[ts][idx] = done_[idx]
        infos_ts[ts][idx] = infos_[idx]
        v_logprob_hidden_ts[ts][idx] = v_logprob_hidden_[idx]
        masks_ts[ts][idx] = masks_[idx]
        bad_masks_ts[ts][idx] = bad_masks_[idx]
        choices_ts[ts][idx] = choices_[idx]

    student_buffer = {}
    teacher_buffer = {}
    for i in range(args.num_processes):
        student_buffer[i] = 0
        teacher_buffer[i] = 0

    teacher_step = 0
    student_step = 0
    for episode in range(num_rounds):
        print()
        print('Episode %s' % episode)
        teacher = random.choice([1, -1])
        envs.reset(teacher, win_avg, last_win_avg)
        over = False
        done_ts = [[0] * args.num_processes, [0] * args.num_processes]
        # teacher_step = 0
        # student_step = 0
        accum_reward_s = np.zeros(args.num_processes)
        accum_reward_t = np.zeros(args.num_processes)
        while not over:
            over = all(np.array(done_ts[0]) + np.array(done_ts[1]))

            # Observe reward and next obs
            # if not over:
            t_or_s, obs, action, reward, done, infos, v_logprob_hidden, masks, bad_masks = envs.step(
                rollouts_student.recurrent_hidden_states[student_step %
                                                         args.num_steps],
                rollouts_teacher.recurrent_hidden_states[teacher_step %
                                                         args.num_steps])

            # print('@', over, t_or_s, done, reward.squeeze())
            # print(action.squeeze())
            choices = [info['choices'] for info in infos]
            # for i in range(len(action)):
            #     assert done[i] or action[i][0] in choices[i], (action[i][0], choices[i])

            for i, who in enumerate(t_or_s):
                save_to_buffer(who, i, obs, action, reward, done, infos,
                               v_logprob_hidden, masks, bad_masks, choices)
                if who == 'teacher':
                    teacher_buffer[i] = 1
                else:
                    student_buffer[i] = 1

            # print(action_ts, choices_ts)

            if all(list(teacher_buffer.values())) or over:
                obs_t = torch.stack(obs_ts[0])
                action_t = torch.stack(action_ts[0])
                reward_t = torch.stack(reward_ts[0])
                # done_t = done_ts[0]
                # infos_t = infos_ts[0]
                v_logprob_hidden_t = torch.stack(v_logprob_hidden_ts[0])
                masks_t = torch.stack(masks_ts[0])
                bad_masks_t = torch.stack(bad_masks_ts[0])
                choices_t = copy.deepcopy(choices_ts[0])

                accum_reward_t = accum_reward_t + np.array(reward_t.squeeze())
                # print('t', accum_reward_t, np.array(reward_t.squeeze()))

                if teacher_step == 0:
                    rollouts_teacher.obs[0].copy_(obs_t)
                    rollouts_teacher.masks[0].copy_(masks_t)
                    rollouts_teacher.bad_masks[0].copy_(bad_masks_t)
                else:
                    rollouts_teacher.insert(obs_t, prev_hidden_t,
                                            prev_action_t, prev_logprob_t,
                                            prev_value_t, prev_reward_t,
                                            masks_t, bad_masks_t,
                                            prev_choices_t)
                prev_action_t = action_t
                prev_value_t = v_logprob_hidden_t[:, 0].unsqueeze(1)
                prev_logprob_t = v_logprob_hidden_t[:, 1].unsqueeze(1)
                prev_hidden_t = v_logprob_hidden_t[:, 2].unsqueeze(1)
                prev_reward_t = reward_t
                # prev_masks = masks
                # prev_bad_masks = bad_masks
                prev_choices_t = copy.deepcopy(choices_t)
                # over_t = all(done_t)
                teacher_step += 1
                for i in range(args.num_processes):
                    teacher_buffer[i] = 0

            if all(list(student_buffer.values())) or over:
                obs_s = torch.stack(obs_ts[1])
                action_s = torch.stack(action_ts[1])
                reward_s = torch.stack(reward_ts[1])
                # done_s = done_ts[1]
                # infos_s = infos_ts[1]
                v_logprob_hidden_s = torch.stack(v_logprob_hidden_ts[1])
                masks_s = torch.stack(masks_ts[1])
                bad_masks_s = torch.stack(bad_masks_ts[1])
                choices_s = copy.deepcopy(choices_ts[1])

                accum_reward_s = accum_reward_s + np.array(reward_s.squeeze())
                # print('s', accum_reward_s, np.array(reward_s.squeeze()))

                if student_step == 0:
                    rollouts_student.obs[0].copy_(obs_s)
                    rollouts_student.masks[0].copy_(masks_s)
                    rollouts_student.bad_masks[0].copy_(bad_masks_s)
                else:
                    rollouts_student.insert(obs_s, prev_hidden_s,
                                            prev_action_s, prev_logprob_s,
                                            prev_value_s, prev_reward_s,
                                            masks_s, bad_masks_s,
                                            prev_choices_s)
                prev_action_s = action_s
                prev_value_s = v_logprob_hidden_s[:, 0].unsqueeze(1)
                prev_logprob_s = v_logprob_hidden_s[:, 1].unsqueeze(1)
                prev_hidden_s = v_logprob_hidden_s[:, 2].unsqueeze(1)
                prev_reward_s = reward_s
                # prev_masks = masks
                # prev_bad_masks = bad_masks
                prev_choices_s = copy.deepcopy(choices_s)
                # over_s = all(done_s)
                student_step += 1
                for i in range(args.num_processes):
                    student_buffer[i] = 0

            if (teacher_step % args.num_steps == 0) and (teacher_step != 0):
                if train_teacher:
                    with torch.no_grad():
                        next_value_teacher = actor_critic_teacher.get_value(
                            rollouts_teacher.obs[-1],
                            rollouts_teacher.recurrent_hidden_states[-1],
                            rollouts_teacher.masks[-1]).detach()
                    rollouts_teacher.compute_returns(
                        next_value_teacher, args.use_gae, args.gamma,
                        args.gae_lambda, args.use_proper_time_limits)
                    value_loss_teacher, action_loss_teacher, dist_entropy_teacher = agent_teacher.update(
                        rollouts_teacher)
                    rollouts_teacher.after_update()
                    if args.use_linear_lr_decay:
                        utils.update_linear_schedule(
                            agent_teacher.optimizer, update_t, num_updates,
                            agent_teacher.optimizer.lr
                            if args.algo == "acktr" else args.lr)
                    update_t += 1
                # teacher_step = 0

            if (student_step % args.num_steps == 0) and (student_step != 0):
                with torch.no_grad():
                    next_value_student = actor_critic_student.get_value(
                        rollouts_student.obs[-1],
                        rollouts_student.recurrent_hidden_states[-1],
                        rollouts_student.masks[-1]).detach()
                rollouts_student.compute_returns(next_value_student,
                                                 args.use_gae, args.gamma,
                                                 args.gae_lambda,
                                                 args.use_proper_time_limits)
                value_loss_student, action_loss_student, dist_entropy_student = agent_student.update(
                    rollouts_student)
                rollouts_student.after_update()
                if args.use_linear_lr_decay:
                    utils.update_linear_schedule(
                        agent_student.optimizer, update_s, num_updates,
                        agent_student.optimizer.lr
                        if args.algo == "acktr" else args.lr)
                update_s += 1
                # student_step = 0

            if over:
                student_wins = 0
                print('reward')
                print(accum_reward_s)
                for r in accum_reward_s:
                    if r > 0:
                        student_wins += 1
                student_win_percent = student_wins / len(accum_reward_s)
            # over = all(done_ts[0]) and all(done_ts[1])
            # over = all(np.array(done_ts[0])+np.array(done_ts[1]))

        if episode % test_interval == 0:
            print('Test')
            games_rand, wins_rand = envs.test(
                'rand', num_test_games,
                rollouts_student.recurrent_hidden_states[0])
            writer.add_scalar("win avg({})".format('rand'),
                              wins_rand / games_rand, episode)
            print('### vs-random winning% {}/{}={}'.format(
                wins_rand, games_rand, wins_rand / games_rand))
            games_greedy, wins_greedy = envs.test(
                'greedy', num_test_games,
                rollouts_student.recurrent_hidden_states[0])
            writer.add_scalar("win avg({})".format('greedy'),
                              wins_greedy / games_greedy, episode)
            print('### vs-greedy winning% {}/{}={}'.format(
                wins_greedy, games_greedy, wins_greedy / games_greedy))
            last_win_avg = copy.deepcopy(win_avg)
            win_avg['rand'] = wins_rand / games_rand
            win_avg['greedy'] = wins_greedy / games_greedy

        if episode % teacher_test_interval == 0:
            print('Test teacher')
            games_rand, wins_rand = envs.test(
                'rand',
                num_test_games,
                rollouts_teacher.recurrent_hidden_states[0],
                teacher=True)
            writer.add_scalar("win avg teacher({})".format('rand'),
                              wins_rand / games_rand, episode)
            print('### vs-random winning% {}/{}={}'.format(
                wins_rand, games_rand, wins_rand / games_rand))
            games_greedy, wins_greedy = envs.test(
                'greedy',
                num_test_games,
                rollouts_teacher.recurrent_hidden_states[0],
                teacher=True)
            writer.add_scalar("win avg teacher({})".format('greedy'),
                              wins_greedy / games_greedy, episode)
            print('### vs-greedy winning% {}/{}={}'.format(
                wins_greedy, games_greedy, wins_greedy / games_greedy))
            last_win_avg = copy.deepcopy(win_avg)
            win_avg['rand'] = wins_rand / games_rand
            win_avg['greedy'] = wins_greedy / games_greedy

        if episode % save_interval == 0:
            if os.path.exists('/data/unagi0/omura'):
                t_save_path = '/data/unagi0/omura/othello/teacher_student/{}_teacher_{}.pth'.format(
                    agent_name, episode)
                s_save_path = '/data/unagi0/omura/othello/teacher_student/{}_student_{}.pth'.format(
                    agent_name, episode)
            else:
                t_save_path = 'data/selfplay/{}_teacher_{}.pth'.format(
                    agent_name, episode)
                s_save_path = 'data/selfplay/{}_student_{}.pth'.format(
                    agent_name, episode)
            torch.save(actor_critic_teacher, t_save_path)
            torch.save(actor_critic_student, s_save_path)

        if teacher_step > args.num_steps and student_step > args.num_steps:
            writer.add_scalar("value_loss_student", value_loss_student,
                              episode)
            writer.add_scalar("action_loss_student", action_loss_student,
                              episode)
            writer.add_scalar("dist_entropy_student", dist_entropy_student,
                              episode)
            writer.add_scalar("student_win_percent", student_win_percent,
                              episode)
            if train_teacher:
                writer.add_scalar("value_loss_teacher", value_loss_teacher,
                                  episode)
                writer.add_scalar("action_loss_teacher", action_loss_teacher,
                                  episode)
                writer.add_scalar("dist_entropy_teacher", dist_entropy_teacher,
                                  episode)
        # print(value_loss, action_loss, dist_entropy)

    envs.over()
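
For orientation, a small standalone sketch of the update-cadence arithmetic used in test above; num_rounds is an assumed value here, while step_per_episode = 32 and num_steps = 64 mirror the constants set inside the function.

# Worked example of the rollout/update bookkeeping (num_rounds is assumed):
step_per_episode = 32      # plies gathered per Othello episode
num_steps = 64             # transitions per PPO rollout (args.num_steps)
num_rounds = 10000         # hypothetical number of training episodes
num_updates = (num_rounds * step_per_episode) // num_steps
print(num_updates)         # -> 5000 scheduled LR-decay steps; each agent calls
                           #    .update() once per filled rollout of num_steps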
Example 3
def test(
        protagonist,
        protagonist_agent_type,
        opponent_agent_type,
        board_size,
        num_rounds,
        protagonist_search_depth,
        opponent_search_depth,
        rand_seed,
        env_init_rand_steps,
        test_init_rand_steps=10,
        num_disk_as_reward=True,
        render=False,
        test_interval=500,
        num_test_games=200,
        save_interval=500,
        # load_path='data/selfplay/rainbow_selfplay_350000.pth'):
        load_path=''):

    args = get_args()
    args.algo = 'ppo'
    args.use_gae = True
    args.lr = 1e-5  #2.5e-4
    args.clip_param = 0.1
    args.value_loss_coef = 0.5  #0.5
    args.num_processes = 8
    args.num_steps = 64  #128
    args.num_mini_batch = 4
    args.log_interval = 1
    args.use_linear_lr_decay = True
    args.entropy_coef = 0  # 0.01
    print(args)

    step_per_episode = 32
    # num_rounds_per_proc = num_rounds // args.num_processes
    num_updates = (num_rounds * step_per_episode) // args.num_steps

    # torch.manual_seed(args.seed)
    # torch.cuda.manual_seed_all(args.seed)
    #
    # if args.cuda and torch.cuda.is_available() and args.cuda_deterministic:
    #     torch.backends.cudnn.benchmark = False
    #     torch.backends.cudnn.deterministic = True
    #
    # log_dir = os.path.expanduser(args.log_dir)
    # eval_log_dir = log_dir + "_eval"
    # utils.cleanup_log_dir(log_dir)
    # utils.cleanup_log_dir(eval_log_dir)

    # torch.set_num_threads(1)
    # device = torch.device("cuda:0" if args.cuda else "cpu")
    device = torch.device("cpu")

    # agent_name = 'ppo_selfplay_8proc_th1e-10_ent1e-2'
    # agent_name = 'ent0_lr1e-5_clip2e-1'
    agent_name = 'ent0_lr1e-5_numstep64'
    # agent_name = 'test'
    writer = SummaryWriter(log_dir="./log/ppo_selfplay/{}".format(agent_name))

    envs_list = []
    for i in range(args.num_processes):
        env = othello.SimpleOthelloEnv(board_size=board_size,
                                       seed=i,
                                       initial_rand_steps=env_init_rand_steps,
                                       num_disk_as_reward=num_disk_as_reward,
                                       render_in_step=render)
        env.rand_steps_holder = env_init_rand_steps
        env.test_rand_steps_holder = test_init_rand_steps
        envs_list.append(env)

    obs_space = spaces.Box(np.zeros((4, 8, 8)), np.ones((4, 8, 8)))
    action_space = spaces.Discrete(board_size**2)

    if load_path:
        actor_critic = torch.load(load_path)
    else:
        actor_critic = Policy(obs_space.shape,
                              action_space,
                              base_kwargs={'recurrent': args.recurrent_policy})
        actor_critic.to(device)

    envs = PPOEnvs(envs_list, subproc_worker, actor_critic, device)

    if args.algo == 'a2c':
        agent = algo.A2C_ACKTR(actor_critic,
                               args.value_loss_coef,
                               args.entropy_coef,
                               lr=args.lr,
                               eps=args.eps,
                               alpha=args.alpha,
                               max_grad_norm=args.max_grad_norm)
    elif args.algo == 'ppo':
        agent = algo.PPO(actor_critic,
                         args.clip_param,
                         args.ppo_epoch,
                         args.num_mini_batch,
                         args.value_loss_coef,
                         args.entropy_coef,
                         lr=args.lr,
                         eps=args.eps,
                         max_grad_norm=args.max_grad_norm)
    elif args.algo == 'acktr':
        agent = algo.A2C_ACKTR(actor_critic,
                               args.value_loss_coef,
                               args.entropy_coef,
                               acktr=True)

    rollouts = RolloutStorage(args.num_steps, args.num_processes,
                              obs_space.shape, action_space,
                              actor_critic.recurrent_hidden_state_size)

    # episode_rewards = deque(maxlen=10)
    u = 0
    step = 0
    for episode in range(num_rounds):
        print()
        print('Episode %s' % episode)
        envs.reset()
        over = False
        while not over:
            if args.use_linear_lr_decay:
                # decrease learning rate linearly
                utils.update_linear_schedule(
                    agent.optimizer, u, num_updates,
                    agent.optimizer.lr if args.algo == "acktr" else args.lr)
            # for step in range(args.num_steps):

            # Observe reward and next obs
            # if not over:
            obs, action, reward, done, infos, v_logprob_hidden, masks, bad_masks = envs.step(
                rollouts.recurrent_hidden_states[step % args.num_steps])
            choices = [info['choices'] for info in infos]
            for i in range(len(action)):
                assert done[i] or action[i][0] in choices[i], (action[i][0],
                                                               choices[i])
                # for info in infos:
                #     if 'episode' in info.keys():
                #         episode_rewards.append(info['episode']['r'])

            if step == 0:
                rollouts.obs[0].copy_(obs)
                rollouts.masks[0].copy_(masks)
                rollouts.bad_masks[0].copy_(bad_masks)
            else:
                rollouts.insert(obs, prev_hidden, prev_action, prev_logprob,
                                prev_value, prev_reward, masks, bad_masks,
                                prev_choices)
                # prev_obs = obs
            prev_action = action
            prev_value = v_logprob_hidden[:, 0].unsqueeze(1)
            prev_logprob = v_logprob_hidden[:, 1].unsqueeze(1)
            prev_hidden = v_logprob_hidden[:, 2].unsqueeze(1)
            prev_reward = reward
            # prev_masks = masks
            # prev_bad_masks = bad_masks
            prev_choices = choices
            over = all(done)

            if (step % args.num_steps == 0) and (step != 0):
                u += 1
                with torch.no_grad():
                    next_value = actor_critic.get_value(
                        rollouts.obs[-1], rollouts.recurrent_hidden_states[-1],
                        rollouts.masks[-1]).detach()

                rollouts.compute_returns(next_value, args.use_gae, args.gamma,
                                         args.gae_lambda,
                                         args.use_proper_time_limits)

                value_loss, action_loss, dist_entropy = agent.update(rollouts)
                rollouts.after_update()
            step += 1

        if episode % test_interval == 0:
            games, wins = envs.test('rand', num_test_games,
                                    rollouts.recurrent_hidden_states[0])
            writer.add_scalar("win%({})".format('rand'), wins / games, episode)
            print('### vs-random winning% {}/{}={}'.format(
                wins, games, wins / games))
            games, wins = envs.test('greedy', num_test_games,
                                    rollouts.recurrent_hidden_states[0])
            writer.add_scalar("win%({})".format('greedy'), wins / games,
                              episode)
            print('### vs-greedy winning% {}/{}={}'.format(
                wins, games, wins / games))
        if episode % save_interval == 0:
            if os.path.exists('/data/unagi0/omura'):
                save_path = '/data/unagi0/omura/othello/selfplay/{}_{}.pth'.format(
                    agent_name, episode)
            else:
                save_path = 'data/selfplay/{}_{}.pth'.format(
                    agent_name, episode)
            torch.save(actor_critic, save_path)

        if step > args.num_steps:
            writer.add_scalar("value_loss", value_loss, episode)
            writer.add_scalar("action_loss", action_loss, episode)
            writer.add_scalar("dist_entropy", dist_entropy, episode)
            print(value_loss, action_loss, dist_entropy)

    envs.over()
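
A hedged call sketch for the self-play test driver above; every argument value below is a placeholder (only the parameter names are taken from the signature), and the call assumes the module-level names used inside the function (othello, algo, Policy, PPOEnvs, subproc_worker, ...) are importable.

# Hypothetical driver call; an empty load_path starts from a fresh Policy.
if __name__ == '__main__':
    test(protagonist=1,
         protagonist_agent_type='ppo',
         opponent_agent_type='rand',
         board_size=8,
         num_rounds=50000,
         protagonist_search_depth=1,
         opponent_search_depth=1,
         rand_seed=0,
         env_init_rand_steps=0,
         load_path='')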
Example 4
def play(
        protagonist,
        protagonist_agent_type='greedy',
        opponent_agent_type='rand',
        board_size=8,
        num_rounds=300000,
        protagonist_search_depth=1,
        opponent_search_depth=1,
        rand_seed=0,
        env_init_rand_steps=0,
        test_init_rand_steps=10,
        num_disk_as_reward=True,
        render=False,
        test_interval=2500,
        num_test_games=200,
        save_interval=5000,
        # load_path='data/selfplay/rainbow_selfplay_350000.pth'):
        load_path=''):
    print('protagonist: {}'.format(protagonist_agent_type))
    print('opponent: {}'.format(opponent_agent_type))

    agent_name = 'rainbow_selfplay_2nd'

    protagonist_policy = create_policy(policy_type=protagonist_agent_type,
                                       board_size=board_size,
                                       seed=rand_seed,
                                       search_depth=protagonist_search_depth,
                                       agent_name=agent_name)
    opponent_policy1 = create_policy(policy_type='rand',
                                     board_size=board_size,
                                     seed=rand_seed,
                                     search_depth=opponent_search_depth)
    opponent_policy2 = create_policy(policy_type='greedy',
                                     board_size=board_size,
                                     seed=rand_seed,
                                     search_depth=opponent_search_depth)
    opponent_policies = [('rand', opponent_policy1),
                         ('greedy', opponent_policy2)]

    # disable .run
    # def nop(*args):
    #     pass
    # opponent_policy.run = nop
    # if not hasattr(protagonist_policy, 'run'):
    #     protagonist_policy.run = nop

    # if opponent_agent_type == 'human':
    #     render_in_step = True
    # else:
    #     render_in_step = False

    if load_path:
        print('Load {} ...'.format(load_path))
        start_episode, loss = load(protagonist_policy, load_path)
    else:
        start_episode = 0

    env = othello.SimpleOthelloEnv(board_size=board_size,
                                   seed=rand_seed,
                                   initial_rand_steps=env_init_rand_steps,
                                   num_disk_as_reward=num_disk_as_reward,
                                   render_in_step=render)

    win_cnts = draw_cnts = lose_cnts = 0
    for i in range(start_episode, num_rounds):
        switch = np.random.randint(2)
        if switch:
            protagonist = protagonist * -1

        # self-play: the same protagonist policy plays both colors; pcolor marks
        # which color's transitions are fed to .run() for learning below
        policy = {}
        if protagonist == -1:
            pcolor = 'black'
            policy['black'] = protagonist_policy
            policy['white'] = protagonist_policy
        else:
            pcolor = 'white'
            policy['black'] = protagonist_policy
            policy['white'] = protagonist_policy

        print('Episode {}'.format(i + 1))
        print('Protagonist is {}'.format(pcolor))

        obs_b = env.reset()
        state_b = make_state(obs_b, env)
        protagonist_policy.reset(env)
        # opponent_policy.reset(env)
        if render:
            env.render()
        done_b = done_w = False
        init = True
        while not (done_b or done_w):
            # black
            assert env.player_turn == -1
            # action_b = policy['black'].get_action(state_b)
            action_b = action('black', pcolor, state_b, policy)
            next_obs_b, reward_b, done_b, _ = env.step(action_b)
            next_state_b = make_state(next_obs_b, env)
            while (not done_b) and env.player_turn == -1:
                if pcolor == 'black':
                    policy['black'].run(state_b, action_b, reward_b, done_b,
                                        next_state_b)
                # action_b = policy['black'].get_action(next_state_b)
                action_b = action('black', pcolor, next_state_b, policy)
                next_obs_b, reward_b, done_b, _ = env.step(action_b)
                next_state_b = make_state(next_obs_b, env)

            # learning white policy (credit white's previous move with -reward_b)
            if not init:
                if pcolor == 'white':
                    policy['white'].run(state_w, action_w, -reward_b, done_b,
                                        next_state_b)
            init = False
            if done_b:
                if pcolor == 'black':
                    policy['black'].run(state_b, action_b, reward_b, done_b,
                                        next_state_b)
                break

            # white
            assert env.player_turn == 1
            state_w = next_state_b
            # action_w = policy['white'].get_action(state_w)
            action_w = action('white', pcolor, state_w, policy)
            next_obs_w, reward_w, done_w, _ = env.step(action_w)
            next_state_w = make_state(next_obs_w, env)
            while (not done_w) and env.player_turn == 1:
                if pcolor == 'white':
                    policy['white'].run(state_w, action_w, reward_w, done_w,
                                        next_state_w)
                # action_w = policy['white'].get_action(next_state_w)
                action_w = action('white', pcolor, next_state_w, policy)
                next_obs_w, reward_w, done_w, _ = env.step(action_w)
                next_state_w = make_state(next_obs_w, env)

            # learning black policy
            if pcolor == 'black':
                policy['black'].run(state_b, action_b, -reward_w, done_w,
                                    next_state_w)
            if done_w:
                if pcolor == 'white':
                    policy['white'].run(state_w, action_w, reward_w, done_w,
                                        next_state_w)
                break

            state_b = next_state_w

            if render:
                env.render()

        if done_w:
            reward = reward_w * protagonist
        elif done_b:
            reward = reward_b * -protagonist
        else:
            raise ValueError

        print('reward={}'.format(reward))
        if num_disk_as_reward:
            total_disks = board_size**2
            if protagonist == 1:
                white_cnts = (total_disks + reward) / 2
                black_cnts = total_disks - white_cnts

                if white_cnts > black_cnts:
                    win_cnts += 1
                elif white_cnts == black_cnts:
                    draw_cnts += 1
                else:
                    lose_cnts += 1

            else:
                black_cnts = (total_disks + reward) / 2
                white_cnts = total_disks - black_cnts

                if black_cnts > white_cnts:
                    win_cnts += 1
                elif white_cnts == black_cnts:
                    draw_cnts += 1
                else:
                    lose_cnts += 1

        else:
            if reward == 1:
                win_cnts += 1
            elif reward == 0:
                draw_cnts += 1
            else:
                lose_cnts += 1
        print('-' * 3)
        print('#Wins: {}, #Draws: {}, #Losses: {}'.format(
            win_cnts, draw_cnts, lose_cnts))

        # calc student's winning %
        if i % test_interval == 0:
            env.initial_rand_steps = test_init_rand_steps
            for name, opponent_policy in opponent_policies:
                wins = 0
                protagonist = -1
                for j in range(num_test_games):
                    switch = np.random.randint(2)
                    if switch:
                        protagonist = protagonist * -1
                    policy = {}
                    if protagonist == -1:
                        pcolor = 'BLACK'
                        policy['black'] = protagonist_policy
                        policy['white'] = opponent_policy
                    else:
                        pcolor = 'WHITE'
                        policy['black'] = opponent_policy
                        policy['white'] = protagonist_policy

                    obs_b = env.reset()
                    state_b = make_state(obs_b, env)
                    protagonist_policy.reset(env)
                    opponent_policy.reset(env)
                    if render:
                        env.render()
                    done_b = done_w = False
                    while not (done_b or done_w):
                        # black
                        assert env.player_turn == -1
                        action_b = policy['black'].get_test_action(state_b)
                        next_obs_b, reward_b, done_b, _ = env.step(action_b)
                        next_state_b = make_state(next_obs_b, env)
                        while (not done_b) and env.player_turn == -1:
                            # policy['black'].run(state_b, action_b, reward_b, done_b, next_state_b)
                            action_b = policy['black'].get_test_action(
                                next_state_b)
                            next_obs_b, reward_b, done_b, _ = env.step(
                                action_b)
                            next_state_b = make_state(next_obs_b, env)
                        if done_b:
                            break

                        # white
                        assert env.player_turn == 1
                        state_w = next_state_b
                        action_w = policy['white'].get_test_action(state_w)
                        next_obs_w, reward_w, done_w, _ = env.step(action_w)
                        next_state_w = make_state(next_obs_w, env)
                        while (not done_w) and env.player_turn == 1:
                            # policy['white'].run(state_w, action_w, reward_w, done_w, next_state_w)
                            action_w = policy['white'].get_test_action(
                                next_state_w)
                            next_obs_w, reward_w, done_w, _ = env.step(
                                action_w)
                            next_state_w = make_state(next_obs_w, env)
                        if done_w:
                            break
                        state_b = next_state_w

                    if done_w:
                        reward = reward_w * protagonist
                    elif done_b:
                        reward = reward_b * -protagonist
                    else:
                        raise ValueError
                    if reward > 0:
                        wins += 1
                # last_win_per = win_per
                win_per = wins / num_test_games
                print()
                print('win % ({}):'.format(name), win_per)
                print()
                protagonist_policy.writer.add_scalar("win%({})".format(name),
                                                     win_per, i)
            env.initial_rand_steps = env_init_rand_steps

        if i % save_interval == 0:
            if os.path.exists('/data/unagi0/omura'):
                save_path = '/data/unagi0/omura/othello/selfplay/{}_{}.pth'.format(
                    agent_name, i)
            else:
                save_path = 'data/selfplay/{}_{}.pth'.format(agent_name, i)
            save(i, protagonist_policy, 0, save_path)
    env.close()

def play(
        teacher,
        teacher_agent_type='rainbow',
        student_agent_type='rainbow',
        opponent_agent_type='',
        board_size=8,
        num_rounds=400000,
        teacher_search_depth=1,
        student_search_depth=1,
        opponent_search_depth=1,
        rand_seed=0,
        env_init_rand_steps=0,
        test_init_rand_steps=10,
        num_disk_as_reward=True,
        render=False,
        train_teacher=False,
        test_interval=2500,
        num_test_games=200,
        teacher_train_steps=5000,
        save_interval=5000,
        # load_path='',
        # load_path='data/selfplay/rainbow_selfplay_350000.pth',
        # load_path='/data/unagi0/omura/othello/selfplay/rainbow_selfplay_350000.pth',
        # load_path='/data/unagi0/omura/othello/selfplay/rainbow_selfplay_2nd_65000.pth',
        load_path='/data/unagi0/omura/othello/teacher_student/rainbow_gre_rand_teacher_train_10interval_mp_59999.pth',
        num_process=1):
    print('teacher: {}'.format(teacher_agent_type))
    print('student: {}'.format(student_agent_type))
    print('opponent: {}'.format(opponent_agent_type))

    agent_name_teacher = 'rainbow_gre_rand_teacher_notrain_mp_load_teacher60k'
    agent_name_student = 'rainbow_gre_rand_student_notrain_mp_load_teacher60k'
    # agent_name_teacher = 'test'
    # agent_name_student = 'test'
    # load_path = ''

    teacher_policy = create_policy(policy_type=teacher_agent_type,
                                   board_size=board_size,
                                   seed=rand_seed,
                                   search_depth=teacher_search_depth,
                                   agent_name=agent_name_teacher)
    student_policy = create_policy(policy_type=student_agent_type,
                                   board_size=board_size,
                                   seed=rand_seed,
                                   search_depth=student_search_depth,
                                   agent_name=agent_name_student)

    opponent_policy1 = create_policy(policy_type='rand',
                                     board_size=board_size,
                                     seed=rand_seed,
                                     search_depth=opponent_search_depth)
    opponent_policy2 = create_policy(policy_type='greedy',
                                     board_size=board_size,
                                     seed=rand_seed,
                                     search_depth=opponent_search_depth)
    opponent_policies = [('rand', opponent_policy1),
                         ('greedy', opponent_policy2)]
    # opponent_policies = [('greedy', opponent_policy1)]

    if not train_teacher:

        def noop(*args):
            pass

        teacher_policy.run = noop
    # if not hasattr(protagonist_policy, 'run'):
    #     protagonist_policy.run = nop

    # if opponent_agent_type == 'human':
    #     render_in_step = True
    # else:
    #     render_in_step = False

    if load_path:
        print('Load {} ...'.format(load_path))
        start_episode, loss = load(teacher_policy, load_path)
    else:
        start_episode = 0

    env = othello.SimpleOthelloEnv(board_size=board_size,
                                   seed=rand_seed,
                                   initial_rand_steps=env_init_rand_steps,
                                   num_disk_as_reward=num_disk_as_reward,
                                   render_in_step=render)
    #
    # env_test = othello.OthelloEnv(
    #     board_size=board_size,
    #     seed=rand_seed,
    #     initial_rand_steps=env_init_rand_steps,
    #     num_disk_as_reward=num_disk_as_reward,
    #     render_in_step=render)

    win_cnts = draw_cnts = lose_cnts = 0
    win_per = {'rand': 0, 'greedy': 0}
    last_win_per = {'rand': 0, 'greedy': 0}
    teacher_queue = queue.Queue()
    # for i in range(start_episode, num_rounds):
    for i in range(num_rounds):
        switch = np.random.randint(2)
        if switch:
            teacher = teacher * -1

        policy = {}
        if teacher == -1:
            tcolor = 'black'
            policy['black'] = teacher_policy
            policy['white'] = student_policy
        else:
            tcolor = 'white'
            policy['black'] = student_policy
            policy['white'] = teacher_policy

        print('Episode {}'.format(i + 1))
        print('Teacher is {}'.format(tcolor))

        def run(color, state, action, reward, done, next_state):
            if color == tcolor:
                if done:
                    teacher_reward = 0
                    for k in win_per.keys():
                        teacher_reward += win_per[k] - last_win_per[k]
                else:
                    teacher_reward = 0
                if student_policy.is_learning():
                    # print('### learning')
                    teacher_queue.put(
                        (state, action, teacher_reward, done, next_state))

            else:
                policy[color].run(state, action, reward, done, next_state)

        obs_b = env.reset()
        state_b = make_state(obs_b, env)
        teacher_policy.reset(env)
        student_policy.reset(env)
        if render:
            env.render()
        done_b = done_w = False
        init = True

        # student_policy.win_queue = mp.Queue(num_process)

        while not (done_b or done_w):
            # black
            assert env.player_turn == -1
            action_b = policy['black'].get_action(state_b)
            next_obs_b, reward_b, done_b, _ = env.step(action_b)
            next_state_b = make_state(next_obs_b, env)
            while (not done_b) and env.player_turn == -1:
                # policy['black'].run(state_b, action_b, reward_b, done_b, next_state_b)
                run('black', state_b, action_b, reward_b, done_b, next_state_b)
                action_b = policy['black'].get_action(next_state_b)
                next_obs_b, reward_b, done_b, _ = env.step(action_b)
                next_state_b = make_state(next_obs_b, env)

            # learning white policy (credit white's previous move with -reward_b)
            if not init:
                # policy['white'].run(state_w, action_w, - reward_b, done_b, next_state_b)
                run('white', state_w, action_w, -reward_b, done_b,
                    next_state_b)
            init = False
            if done_b:
                # policy['black'].run(state_b, action_b, reward_b, done_b, next_state_b)
                run('black', state_b, action_b, reward_b, done_b, next_state_b)
                break

            # white
            assert env.player_turn == 1
            state_w = next_state_b
            action_w = policy['white'].get_action(state_w)
            next_obs_w, reward_w, done_w, _ = env.step(action_w)
            next_state_w = make_state(next_obs_w, env)
            while (not done_w) and env.player_turn == 1:
                # policy['white'].run(state_w, action_w, reward_w, done_w, next_state_w)
                run('white', state_w, action_w, reward_w, done_w, next_state_w)
                action_w = policy['white'].get_action(next_state_w)
                next_obs_w, reward_w, done_w, _ = env.step(action_w)
                next_state_w = make_state(next_obs_w, env)

            # learning black policy
            # policy['black'].run(state_b, action_b, - reward_w, done_w, next_state_w)
            run('black', state_b, action_b, -reward_w, done_w, next_state_w)
            if done_w:
                # policy['white'].run(state_w, action_w, reward_w, done_w, next_state_w)
                run('white', state_w, action_w, reward_w, done_w, next_state_w)
                break

            state_b = next_state_w

            if render:
                env.render()

        if done_w:
            reward = reward_w * teacher
        elif done_b:
            reward = reward_b * -teacher
        else:
            raise ValueError

        print('reward={}'.format(reward))
        if num_disk_as_reward:
            total_disks = board_size**2
            if teacher == 1:
                white_cnts = (total_disks + reward) / 2
                black_cnts = total_disks - white_cnts

                if white_cnts > black_cnts:
                    win_cnts += 1
                elif white_cnts == black_cnts:
                    draw_cnts += 1
                else:
                    lose_cnts += 1

            else:
                black_cnts = (total_disks + reward) / 2
                white_cnts = total_disks - black_cnts

                if black_cnts > white_cnts:
                    win_cnts += 1
                elif white_cnts == black_cnts:
                    draw_cnts += 1
                else:
                    lose_cnts += 1

        else:
            if reward == 1:
                win_cnts += 1
            elif reward == 0:
                draw_cnts += 1
            else:
                lose_cnts += 1
        print('-' * 3)
        print('#Wins: {}, #Draws: {}, #Losses: {}'.format(
            win_cnts, draw_cnts, lose_cnts))

        if teacher_queue.qsize() >= teacher_train_steps:
            while not teacher_queue.empty():
                trans = teacher_queue.get()
                teacher_policy.run(*trans)

        # calc student's winning %
        args_student = (student_agent_type, board_size, rand_seed,
                        student_search_depth, 'calc_win_rate')

        state_dict = student_policy.network_state_dict()
        if i % test_interval == 0:
            env.initial_rand_steps = test_init_rand_steps
            for name, opponent_policy in opponent_policies:
                win_queue = mp.Queue(num_process)
                p_games = num_test_games // num_process
                total_games = p_games * num_process
                ps = []
                student_policies = []
                # for j in range(num_process):
                #     student_policies.append(copy.deepcopy(student_policy))

                for j in range(num_process):
                    ps.append(
                        mp.Process(target=calc_win,
                                   args=(env, p_games, args_student,
                                         state_dict, opponent_policy,
                                         win_queue)))
                for p in ps:
                    p.start()
                    # time.sleep(0.5)
                for p in ps:
                    p.join()

                # assert win_queue.qsize() == num_process

                total_wins = 0
                for _ in range(num_process):
                    total_wins += win_queue.get()

                last_win_per[name] = win_per[name]
                win_per[name] = total_wins / total_games
                student_policy.writer.add_scalar("win%({})".format(name),
                                                 win_per[name], i)
            print()
            print('last win%:', last_win_per)
            print('win%:', win_per)
            print()
            env.initial_rand_steps = env_init_rand_steps

        if (i + 1) % save_interval == 0:
            teacher_path = '/data/unagi0/omura/othello/teacher_student/{}_{}.pth'.format(
                agent_name_teacher, i + 1)
            student_path = '/data/unagi0/omura/othello/teacher_student/{}_{}.pth'.format(
                agent_name_student, i + 1)
            # teacher_path = 'data/teacher_student/{}_{}.pth'.format(agent_name_teacher, i)
            # student_path = 'data/teacher_student/{}_{}.pth'.format(agent_name_student, i)
            save(i, teacher_policy, 0, teacher_path)
            save(i, student_policy, 0, student_path)

    env.close()
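
Finally, a hypothetical invocation of the teacher-student play loop above; the starting color, round and process counts, and the cleared load_path are illustrative assumptions, and the call relies on the module-level imports (create_policy, make_state, othello, mp, ...) already used inside the function.

# Hypothetical invocation; values are placeholders, not project defaults.
if __name__ == '__main__':
    play(teacher=-1,              # teacher starts as the black player
         num_rounds=1000,
         train_teacher=False,     # matches the '*_notrain_*' agent names above
         load_path='',            # train from scratch instead of the hard-coded checkpoint
         num_process=2)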