Example #1
def rule_base_game(name, num_games, env, pipe):
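    """Evaluate the pipe-driven agent against a rule-based opponent.

    The protagonist colour is drawn at random for each game; its moves are
    requested over `pipe` with 'need_action' messages, while the opponent of
    type `name` ('rand' or 'greedy') plays `policy.get_test_action`.  Returns
    the number of protagonist wins over `num_games` games.
    """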
    env.initial_rand_steps = env.test_rand_steps_holder
    dummy_outputs = (0, 0, 0)

    policy = create_policy(policy_type=name,
                           board_size=env.board_size,
                           seed=env.rand_seed)

    def get_action(color, p_color, state):
        if color == p_color:
            pipe.send((state, 0, 0, 0, {
                'type': 'need_action',
                'choices': env.possible_moves
            }, dummy_outputs))
            _, action, _ = pipe.recv()
        else:
            action = policy.get_test_action(state)
        return action

    num_wins = 0
    for _ in range(num_games):
        # reset
        obs_b = env.reset()
        policy.reset(env)
        state_b = make_state(obs_b, env)
        protagonist = np.random.randint(2)
        protagonist = -1 if protagonist == 0 else 1
        pcolor = 'black' if protagonist == -1 else 'white'
        done = False
        done_b = done_w = False
        init = True

        # game
        while not (done_b or done_w):
            assert env.player_turn == -1
            action_b = get_action(env.player_turn, protagonist, state_b)
            next_obs_b, reward_b, done_b, _ = env.step(action_b)
            next_state_b = make_state(next_obs_b, env)
            while (not done_b) and env.player_turn == -1:
                action_b = get_action(env.player_turn, protagonist,
                                      next_state_b)

                next_obs_b, reward_b, done_b, _ = env.step(action_b)
                next_state_b = make_state(next_obs_b, env)

            if done_b:
                break
            init = False

            # white
            assert env.player_turn == 1
            state_w = next_state_b
            action_w = get_action(env.player_turn, protagonist, state_w)
            next_obs_w, reward_w, done_w, _ = env.step(action_w)
            next_state_w = make_state(next_obs_w, env)
            while (not done_w) and env.player_turn == 1:
                action_w = get_action(env.player_turn, protagonist,
                                      next_state_w)
                next_obs_w, reward_w, done_w, _ = env.step(action_w)
                next_state_w = make_state(next_obs_w, env)

            if done_w:
                break
            state_b = next_state_w

        if done_w:
            reward = reward_w * protagonist
        elif done_b:
            reward = reward_b * -protagonist
        else:
            raise ValueError
        if reward > 0:
            num_wins += 1
    env.initial_rand_steps = env.rand_steps_holder
    return num_wins
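rule_base_game only ever asks for moves; whatever sits on the other end of `pipe` has to answer each 'need_action' message with a (cmd, action, output) triple, of which only the action is read. Below is a minimal sketch of that answering side, assuming a worker process is currently inside rule_base_game on the other end of the connection; `answer_action_requests` and the random-move reply are illustrative only, not part of the original project:

import random

def answer_action_requests(conn):
    # Keep answering the worker's 'need_action' requests.
    while True:
        try:
            state, _, _, _, info, _ = conn.recv()
        except EOFError:
            break                                   # other end of the pipe was closed
        if info.get('type') == 'need_action':
            move = random.choice(info['choices'])   # any legal move will do
            conn.send(('step', move, None))         # only the action is read by the worker
        else:
            break                                   # worker moved on (e.g. reports results)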
Example #2
def othello_teacher_vs_student(id, env, pipe, parent_pipe):
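    """Pipe-driven worker for teacher-vs-student games.

    The parent process sends ('reset', (teacher, win_avg, last_win_avg)),
    ('step', ...), 'test-rand', 'test-greedy' and 'over' commands.  During a
    game the worker asks the parent for black/white actions, sends back
    colour-tagged transitions, and replaces the teacher's reward with the
    change in the student's win rates once the game ends.
    """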
    parent_pipe.close()
    o = np.zeros((4, env.board_size, env.board_size))
    dummy_outputs = (0, 0, 0)

    done = 0

    i = 0
    recv = True
    while True:
        i += 1
        if recv:
            cmd, a = pipe.recv()
            print('##', id, cmd, done)
        else:
            recv = True

        if cmd == 'over':
            break
        elif cmd == 'reset':
            obs_b = env.reset()
            state_b = make_state(obs_b, env)
            # protagonist = np.random.randint(2)
            teacher, win_avg, last_win_avg = a
            tcolor = 'black' if teacher == -1 else 'white'
            int_to_str = {-1: 'black', 1: 'white'}
            done = False
            done_b = done_w = False
            init = True

        elif cmd == 'step':

            def send_transition(color, state, action, reward, done_, choices,
                                output):
                if color == tcolor:
                    if done_:
                        teacher_reward = 0
                        for k in win_avg.keys():
                            teacher_reward += win_avg[k] - last_win_avg[k]
                    else:
                        teacher_reward = 0
                    # teacher_queue.put((state, action, teacher_reward, done, next_state))
                    pipe.send((state, action, teacher_reward, done_, {
                        'type':
                        '{}_transition'.format(int_to_str[teacher]),
                        'choices':
                        choices
                    }, output))
                    recv_ = pipe.recv()[0]

                else:
                    # policy[color].run(state, action, reward, done, next_state)
                    pipe.send((state, action, reward, done_, {
                        'type':
                        '{}_transition'.format(int_to_str[teacher * -1]),
                        'choices':
                        choices
                    }, output))
                    recv_ = pipe.recv()[0]
                return recv_

            if done:
                pipe.send((o, 0, 0, done, {
                    'type': 'over',
                    'choices': []
                }, dummy_outputs))
                cmd, a = pipe.recv()
                if cmd == 'reset':
                    recv = False
                elif cmd == 'over':
                    break
                continue

            while not (done_b or done_w):
                # black
                assert env.player_turn == -1
                pipe.send((state_b, 0, 0, 0, {
                    'type': 'need_black_action',
                    'choices': env.possible_moves
                }, dummy_outputs))
                cmd, action_b, output_b = pipe.recv()
                choice_b = env.possible_moves
                assert cmd == 'step'
                next_obs_b, reward_b, done_b, _ = env.step(action_b)
                next_state_b = make_state(next_obs_b, env)
                while (not done_b) and env.player_turn == -1:
                    cmd = send_transition('black', state_b, action_b, reward_b,
                                          done_b, choice_b, output_b)
                    # pipe.send((state_b, action_b, reward_b, done_b, {'type': 'transition', 'choices': choice_b}, output_b))
                    # cmd = pipe.recv()[0]
                    assert cmd == 'step'
                    pipe.send((next_state_b, 0, 0, 0, {
                        'type': 'need_black_action',
                        'choices': env.possible_moves
                    }, dummy_outputs))
                    cmd, action_b, output_b = pipe.recv()
                    choice_b = env.possible_moves
                    assert cmd == 'step'
                    next_obs_b, reward_b, done_b, _ = env.step(action_b)
                    next_state_b = make_state(next_obs_b, env)

                if not init:
                    # if pcolor == 'white':
                    cmd = send_transition('white', state_w, action_w,
                                          -reward_b, done_b, choice_w,
                                          output_w)
                    # pipe.send((state_w, action_w, - reward_b, done_b, {'type': 'transition', 'choices': choice_w}, output_w))
                    # cmd = pipe.recv()[0]
                    assert cmd == 'step'

                if done_b:
                    cmd = send_transition('black', state_b, action_b, reward_b,
                                          done_b, choice_b, output_b)
                    # pipe.send((state_b, action_b, reward_b, done_b, {'type': 'transition', 'choices': env.possible_moves}, output_b))
                    # cmd = pipe.recv()[0]
                    assert cmd == 'step'
                    # if init:
                    # cmd = send_transition('white', o, 0, 0, done_b, [], dummy_outputs)
                    # pipe.send((o, 0, 0, done_b, {'type': 'transition', 'choices': env.possible_moves}, dummy_outputs))
                    # cmd = pipe.recv()[0]
                    # assert cmd == 'step'
                    break
                init = False

                # white
                assert env.player_turn == 1
                state_w = next_state_b
                pipe.send((state_w, 0, 0, 0, {
                    'type': 'need_white_action',
                    'choices': env.possible_moves
                }, dummy_outputs))
                cmd, action_w, output_w = pipe.recv()
                choice_w = env.possible_moves
                assert cmd == 'step'
                next_obs_w, reward_w, done_w, _ = env.step(action_w)
                next_state_w = make_state(next_obs_w, env)
                while (not done_w) and env.player_turn == 1:
                    cmd = send_transition('white', state_w, action_w, reward_w,
                                          done_w, choice_w, output_w)
                    # pipe.send((state_w, action_w, reward_w, done_w, {'type': 'transition', 'choices': choice_w}, output_w))
                    # cmd = pipe.recv()[0]
                    assert cmd == 'step'
                    pipe.send((next_state_w, 0, 0, 0, {
                        'type': 'need_white_action',
                        'choices': env.possible_moves
                    }, dummy_outputs))
                    cmd, action_w, output_w = pipe.recv()
                    choice_w = env.possible_moves
                    assert cmd == 'step'
                    next_obs_w, reward_w, done_w, _ = env.step(action_w)
                    next_state_w = make_state(next_obs_w, env)

                # learning black policy
                cmd = send_transition('black', state_b, action_b, -reward_w,
                                      done_w, choice_b, output_b)
                # pipe.send((state_b, action_b, - reward_w, done_w, {'type': 'transition', 'choices': choice_b}, output_b))
                # cmd, pipe.recv()[0]
                assert cmd == 'step'
                if done_w:
                    cmd = send_transition('white', state_w, action_w, reward_w,
                                          done_w, choice_w, output_w)
                    # pipe.send((state_w, action_w, reward_w, done_w, {'type': None, 'choices': choice_w}, output_w))
                    # cmd = pipe.recv()[0]
                    assert cmd == 'step'
                    break
                state_b = next_state_w
            done = True

        elif cmd == 'test-rand':
            num_wins = rule_base_game('rand', a, env, pipe)
            while cmd != 'finish-test':
                pipe.send((o, 0, 0, 0, {
                    'type': 'over',
                    'choices': env.possible_moves,
                    'wins': num_wins
                }, dummy_outputs))
                cmd, _, _ = pipe.recv()
        elif cmd == 'test-greedy':
            num_wins = rule_base_game('greedy', a, env, pipe)
            while cmd != 'finish-test':
                pipe.send((o, 0, 0, 0, {
                    'type': 'over',
                    'choices': env.possible_moves,
                    'wins': num_wins
                }, dummy_outputs))
                cmd, _, _ = pipe.recv()
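The only non-zero reward the teacher receives is the terminal one computed in send_transition: the summed change in the student's win rates (the win_avg and last_win_avg dictionaries delivered with the 'reset' command) since the previous evaluation. A toy calculation with made-up win rates:

last_win_avg = {'rand': 0.55, 'greedy': 0.40}   # hypothetical rates from the previous test
win_avg = {'rand': 0.62, 'greedy': 0.38}        # hypothetical rates from the latest test

teacher_reward = 0
for k in win_avg.keys():
    teacher_reward += win_avg[k] - last_win_avg[k]
# 0.07 + (-0.02) = 0.05: the teacher is rewarded only for overall student improvement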
def play(
        protagonist,
        protagonist_agent_type='greedy',
        opponent_agent_type='rand',
        board_size=8,
        num_rounds=300000,
        protagonist_search_depth=1,
        opponent_search_depth=1,
        rand_seed=0,
        env_init_rand_steps=0,
        test_init_rand_steps=10,
        num_disk_as_reward=True,
        render=False,
        test_interval=2500,
        num_test_games=200,
        save_interval=5000,
        # load_path='data/selfplay/rainbow_selfplay_350000.pth'):
        load_path=''):
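    """Self-play training loop.

    A single protagonist policy plays both colours and is trained online; it
    is evaluated against random and greedy opponents every `test_interval`
    episodes and checkpointed every `save_interval` episodes.
    """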
    print('protagonist: {}'.format(protagonist_agent_type))
    print('opponent: {}'.format(opponent_agent_type))

    agent_name = 'rainbow_selfplay_2nd'

    protagonist_policy = create_policy(policy_type=protagonist_agent_type,
                                       board_size=board_size,
                                       seed=rand_seed,
                                       search_depth=protagonist_search_depth,
                                       agent_name=agent_name)
    opponent_policy1 = create_policy(policy_type='rand',
                                     board_size=board_size,
                                     seed=rand_seed,
                                     search_depth=opponent_search_depth)
    opponent_policy2 = create_policy(policy_type='greedy',
                                     board_size=board_size,
                                     seed=rand_seed,
                                     search_depth=opponent_search_depth)
    opponent_policies = [('rand', opponent_policy1),
                         ('greedy', opponent_policy2)]

    # disable .run
    # def nop(*args):
    #     pass
    # opponent_policy.run = nop
    # if not hasattr(protagonist_policy, 'run'):
    #     protagonist_policy.run = nop

    # if opponent_agent_type == 'human':
    #     render_in_step = True
    # else:
    #     render_in_step = False

    if load_path:
        print('Load {} ...'.format(load_path))
        start_episode, loss = load(protagonist_policy, load_path)
    else:
        start_episode = 0

    env = othello.SimpleOthelloEnv(board_size=board_size,
                                   seed=rand_seed,
                                   initial_rand_steps=env_init_rand_steps,
                                   num_disk_as_reward=num_disk_as_reward,
                                   render_in_step=render)

    win_cnts = draw_cnts = lose_cnts = 0
    for i in range(start_episode, num_rounds):
        switch = np.random.randint(2)
        if switch:
            protagonist = protagonist * -1

        pcolor = 'black' if protagonist == -1 else 'white'
        # self-play: the same protagonist policy controls both colours
        policy = {'black': protagonist_policy, 'white': protagonist_policy}

        print('Episode {}'.format(i + 1))
        print('Protagonist is {}'.format(pcolor))

        obs_b = env.reset()
        state_b = make_state(obs_b, env)
        protagonist_policy.reset(env)
        # opponent_policy.reset(env)
        if render:
            env.render()
        done_b = done_w = False
        init = True
        while not (done_b or done_w):
            # black
            assert env.player_turn == -1
            # action_b = policy['black'].get_action(state_b)
            action_b = action('black', pcolor, state_b, policy)
            next_obs_b, reward_b, done_b, _ = env.step(action_b)
            next_state_b = make_state(next_obs_b, env)
            while (not done_b) and env.player_turn == -1:
                if pcolor == 'black':
                    policy['black'].run(state_b, action_b, reward_b, done_b,
                                        next_state_b)
                # action_b = policy['black'].get_action(next_state_b)
                action_b = action('black', pcolor, next_state_b, policy)
                next_obs_b, reward_b, done_b, _ = env.step(action_b)
                next_state_b = make_state(next_obs_b, env)

            # learning black policy
            if not init:
                if pcolor == 'white':
                    policy['white'].run(state_w, action_w, -reward_b, done_b,
                                        next_state_b)
            init = False
            if done_b:
                if pcolor == 'black':
                    policy['black'].run(state_b, action_b, reward_b, done_b,
                                        next_state_b)
                break

            # white
            assert env.player_turn == 1
            state_w = next_state_b
            # action_w = policy['white'].get_action(state_w)
            action_w = action('white', pcolor, state_w, policy)
            next_obs_w, reward_w, done_w, _ = env.step(action_w)
            next_state_w = make_state(next_obs_w, env)
            while (not done_w) and env.player_turn == 1:
                if pcolor == 'white':
                    policy['white'].run(state_w, action_w, reward_w, done_w,
                                        next_state_w)
                # action_w = policy['white'].get_action(next_state_w)
                action_w = action('white', pcolor, next_state_w, policy)
                next_obs_w, reward_w, done_w, _ = env.step(action_w)
                next_state_w = make_state(next_obs_w, env)

            # learning black policy
            if pcolor == 'black':
                policy['black'].run(state_b, action_b, -reward_w, done_w,
                                    next_state_w)
            if done_w:
                if pcolor == 'white':
                    policy['white'].run(state_w, action_w, reward_w, done_w,
                                        next_state_w)
                break

            state_b = next_state_w

            if render:
                env.render()

        if done_w:
            reward = reward_w * protagonist
        elif done_b:
            reward = reward_b * -protagonist
        else:
            raise ValueError

        print('reward={}'.format(reward))
        if num_disk_as_reward:
            total_disks = board_size**2
            if protagonist == 1:
                white_cnts = (total_disks + reward) / 2
                black_cnts = total_disks - white_cnts

                if white_cnts > black_cnts:
                    win_cnts += 1
                elif white_cnts == black_cnts:
                    draw_cnts += 1
                else:
                    lose_cnts += 1

            else:
                black_cnts = (total_disks + reward) / 2
                white_cnts = total_disks - black_cnts

                if black_cnts > white_cnts:
                    win_cnts += 1
                elif white_cnts == black_cnts:
                    draw_cnts += 1
                else:
                    lose_cnts += 1

        else:
            if reward == 1:
                win_cnts += 1
            elif reward == 0:
                draw_cnts += 1
            else:
                lose_cnts += 1
        print('-' * 3)
        print('#Wins: {}, #Draws: {}, #Loses: {}'.format(
            win_cnts, draw_cnts, lose_cnts))

        # calc student's winning %
        if i % test_interval == 0:
            env.initial_rand_steps = test_init_rand_steps
            for name, opponent_policy in opponent_policies:
                wins = 0
                protagonist = -1
                for j in range(num_test_games):
                    switch = np.random.randint(2)
                    if switch:
                        protagonist = protagonist * -1
                    policy = {}
                    if protagonist == -1:
                        pcolor = 'BLACK'
                        policy['black'] = protagonist_policy
                        policy['white'] = opponent_policy
                    else:
                        pcolor = 'WHITE'
                        policy['black'] = opponent_policy
                        policy['white'] = protagonist_policy

                    obs_b = env.reset()
                    state_b = make_state(obs_b, env)
                    protagonist_policy.reset(env)
                    opponent_policy.reset(env)
                    if render:
                        env.render()
                    done_b = done_w = False
                    while not (done_b or done_w):
                        # black
                        assert env.player_turn == -1
                        action_b = policy['black'].get_test_action(state_b)
                        next_obs_b, reward_b, done_b, _ = env.step(action_b)
                        next_state_b = make_state(next_obs_b, env)
                        while (not done_b) and env.player_turn == -1:
                            # policy['black'].run(state_b, action_b, reward_b, done_b, next_state_b)
                            action_b = policy['black'].get_test_action(
                                next_state_b)
                            next_obs_b, reward_b, done_b, _ = env.step(
                                action_b)
                            next_state_b = make_state(next_obs_b, env)
                        if done_b:
                            break

                        # white
                        assert env.player_turn == 1
                        state_w = next_state_b
                        action_w = policy['white'].get_test_action(state_w)
                        next_obs_w, reward_w, done_w, _ = env.step(action_w)
                        next_state_w = make_state(next_obs_w, env)
                        while (not done_w) and env.player_turn == 1:
                            # policy['white'].run(state_w, action_w, reward_w, done_w, next_state_w)
                            action_w = policy['white'].get_test_action(
                                next_state_w)
                            next_obs_w, reward_w, done_w, _ = env.step(
                                action_w)
                            next_state_w = make_state(next_obs_w, env)
                        if done_w:
                            break
                        state_b = next_state_w

                    if done_w:
                        reward = reward_w * protagonist
                    elif done_b:
                        reward = reward_b * -protagonist
                    else:
                        raise ValueError
                    if reward > 0:
                        wins += 1
                # last_win_per = win_per
                win_per = wins / num_test_games
                print()
                print('win % ({}):'.format(name), win_per)
                print()
                protagonist_policy.writer.add_scalar("win%({})".format(name),
                                                     win_per, i)
            # restore the training random-step setting after evaluating against every opponent
            env.initial_rand_steps = env_init_rand_steps

        if i % save_interval == 0:
            if os.path.exists('/data/unagi0/omura'):
                save_path = '/data/unagi0/omura/othello/selfplay/{}_{}.pth'.format(
                    agent_name, i)
            else:
                save_path = 'data/selfplay/{}_{}.pth'.format(agent_name, i)
            save(i, protagonist_policy, 0, save_path)
    env.close()
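This version of play calls a module-level helper action(color, pcolor, state, policy) that is not included in the listing. Judging from the commented-out policy[color].get_action(state) calls it replaces, a plausible stand-in, stated here purely as an assumption rather than the project's actual helper, is:

def action(color, pcolor, state, policy):
    # Assumed behaviour: the protagonist's colour explores with the learning
    # policy, the other colour plays its deterministic evaluation move.
    if color == pcolor:
        return policy[color].get_action(state)
    return policy[color].get_test_action(state)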
Example #4
def subproc_worker(id, env, pipe, parent_pipe):
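    """Pipe-driven self-play worker.

    Handles 'reset', 'step', 'test-rand', 'test-greedy' and 'over' commands.
    During 'step' it requests moves with 'need_action' messages, reports the
    protagonist's transitions back to the parent for learning, and on the
    test commands it runs `rule_base_game` and reports the win count.
    """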
    parent_pipe.close()
    o = np.zeros((4, env.board_size, env.board_size))
    dummy_outputs = (0, 0, 0)

    done = 0

    i = 0
    recv = True
    while True:
        i += 1
        if recv:
            cmd, a = pipe.recv()
        else:
            recv = True

        if cmd == 'over':
            break
        elif cmd == 'reset':
            obs_b = env.reset()
            state_b = make_state(obs_b, env)
            protagonist = np.random.randint(2)
            protagonist = -1 if protagonist == 0 else 1
            pcolor = 'black' if protagonist == -1 else 'white'
            done = False
            done_b = done_w = False
            init = True

        elif cmd == 'step':
            if done:
                pipe.send((o, 0, 0, done, {
                    'type': 'over',
                    'choices': env.possible_moves
                }, dummy_outputs))
                cmd, a = pipe.recv()
                if cmd == 'reset':
                    recv = False
                elif cmd == 'over':
                    break
                continue

            while not (done_b or done_w):
                # black
                assert env.player_turn == -1
                pipe.send((state_b, 0, 0, 0, {
                    'type': 'need_action',
                    'choices': env.possible_moves
                }, dummy_outputs))
                cmd, action_b, output_b = pipe.recv()
                choice_b = env.possible_moves
                assert cmd == 'step'
                next_obs_b, reward_b, done_b, _ = env.step(action_b)
                next_state_b = make_state(next_obs_b, env)
                while (not done_b) and env.player_turn == -1:
                    if pcolor == 'black':
                        pipe.send((state_b, action_b, reward_b, done_b, {
                            'type': None,
                            'choices': choice_b
                        }, output_b))
                        cmd = pipe.recv()[0]
                        assert cmd == 'step'
                    pipe.send((next_state_b, 0, 0, 0, {
                        'type': 'need_action',
                        'choices': env.possible_moves
                    }, dummy_outputs))
                    cmd, action_b, output_b = pipe.recv()
                    choice_b = env.possible_moves
                    assert cmd == 'step'
                    next_obs_b, reward_b, done_b, _ = env.step(action_b)
                    next_state_b = make_state(next_obs_b, env)

                # learning black policy
                if not init:
                    if pcolor == 'white':
                        pipe.send((state_w, action_w, -reward_b, done_b, {
                            'type': None,
                            'choices': choice_w
                        }, output_w))
                        cmd = pipe.recv()[0]
                        assert cmd == 'step'

                if done_b:
                    if pcolor == 'black':
                        pipe.send((state_b, action_b, reward_b, done_b, {
                            'type': None,
                            'choices': choice_b
                        }, output_b))
                        cmd = pipe.recv()[0]
                        assert cmd == 'step'
                    if pcolor == 'white' and init:
                        pipe.send((o, 0, 0, done_b, {
                            'type': None,
                            'choices': []
                        }, dummy_outputs))
                        cmd = pipe.recv()[0]
                        assert cmd == 'step'
                    break
                init = False

                # white
                assert env.player_turn == 1
                state_w = next_state_b
                pipe.send((state_w, 0, 0, 0, {
                    'type': 'need_action',
                    'choices': env.possible_moves
                }, dummy_outputs))
                cmd, action_w, output_w = pipe.recv()
                choice_w = env.possible_moves
                assert cmd == 'step'
                next_obs_w, reward_w, done_w, _ = env.step(action_w)
                next_state_w = make_state(next_obs_w, env)
                while (not done_w) and env.player_turn == 1:
                    if pcolor == 'white':
                        pipe.send((state_w, action_w, reward_w, done_w, {
                            'type': None,
                            'choices': choice_w
                        }, output_w))
                        cmd = pipe.recv()[0]
                        assert cmd == 'step'
                    pipe.send((next_state_w, 0, 0, 0, {
                        'type': 'need_action',
                        'choices': env.possible_moves
                    }, dummy_outputs))
                    cmd, action_w, output_w = pipe.recv()
                    choice_w = env.possible_moves
                    assert cmd == 'step'
                    next_obs_w, reward_w, done_w, _ = env.step(action_w)
                    next_state_w = make_state(next_obs_w, env)

                # learning black policy
                if pcolor == 'black':
                    pipe.send((state_b, action_b, -reward_w, done_w, {
                        'type': None,
                        'choices': choice_b
                    }, output_b))
                    cmd = pipe.recv()[0]
                    assert cmd == 'step'
                if done_w:
                    if pcolor == 'white':
                        pipe.send((state_w, action_w, reward_w, done_w, {
                            'type': None,
                            'choices': choice_w
                        }, output_w))
                        cmd = pipe.recv()[0]
                        assert cmd == 'step'
                    break
                state_b = next_state_w
            done = True

        elif cmd == 'test-rand':
            num_wins = rule_base_game('rand', a, env, pipe)
            while cmd != 'finish-test':
                pipe.send((o, 0, 0, 0, {
                    'type': 'over',
                    'choices': env.possible_moves,
                    'wins': num_wins
                }, dummy_outputs))
                cmd, _, _ = pipe.recv()
        elif cmd == 'test-greedy':
            num_wins = rule_base_game('greedy', a, env, pipe)
            while cmd != 'finish-test':
                pipe.send((o, 0, 0, 0, {
                    'type': 'over',
                    'choices': env.possible_moves,
                    'wins': num_wins
                }, dummy_outputs))
                cmd, _, _ = pipe.recv()
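subproc_worker (like othello_teacher_vs_student above) takes both ends of a multiprocessing pipe and immediately closes the parent's copy, so the parent has to do the mirror image. A minimal launch sketch, assuming `env` is a SimpleOthelloEnv constructed as in play; the variable names and the first-legal-move reply are illustrative only:

import multiprocessing as mp

remote, work_remote = mp.Pipe()
worker = mp.Process(target=subproc_worker, args=(0, env, work_remote, remote))
worker.daemon = True
worker.start()
work_remote.close()                      # keep only the parent's end open

remote.send(('reset', None))
remote.send(('step', None))              # the worker now asks for an action
state, _, _, _, info, _ = remote.recv()
assert info['type'] == 'need_action'
remote.send(('step', info['choices'][0], (0, 0, 0)))  # reply is (cmd, action, output)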
def calc_win(env, num_test_games, args_policy, state_dict, opponent_policy,
             win_queue):
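    """Run evaluation games between a reconstructed student policy and an opponent.

    A fresh student policy is built from `args_policy`, its weights are loaded
    from `state_dict`, colours are drawn at random each game, both sides play
    `get_test_action`, and the number of student wins is put on `win_queue`.
    """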
    env = copy.deepcopy(env)
    student_policy = create_policy(*args_policy)
    student_policy.load_state_dict(state_dict)
    wins = 0
    for j in range(num_test_games):
        student = np.random.randint(2)
        student = -1 if student == 0 else 1
        policy = {}
        if student == -1:
            policy['black'] = student_policy
            policy['white'] = opponent_policy
        else:
            policy['black'] = opponent_policy
            policy['white'] = student_policy

        obs_b = env.reset()
        state_b = make_state(obs_b, env)
        student_policy.reset(env)
        opponent_policy.reset(env)

        done_b = done_w = False
        while not (done_b or done_w):
            # black
            assert env.player_turn == -1
            action_b = policy['black'].get_test_action(state_b)
            next_obs_b, reward_b, done_b, _ = env.step(action_b)
            next_state_b = make_state(next_obs_b, env)
            while (not done_b) and env.player_turn == -1:
                # policy['black'].run(state_b, action_b, reward_b, done_b, next_state_b)
                action_b = policy['black'].get_test_action(next_state_b)
                next_obs_b, reward_b, done_b, _ = env.step(action_b)
                next_state_b = make_state(next_obs_b, env)
            if done_b:
                break

            # white
            assert env.player_turn == 1
            state_w = next_state_b
            action_w = policy['white'].get_test_action(state_w)
            next_obs_w, reward_w, done_w, _ = env.step(action_w)
            next_state_w = make_state(next_obs_w, env)
            while (not done_w) and env.player_turn == 1:
                # policy['white'].run(state_w, action_w, reward_w, done_w, next_state_w)
                action_w = policy['white'].get_test_action(next_state_w)
                next_obs_w, reward_w, done_w, _ = env.step(action_w)
                next_state_w = make_state(next_obs_w, env)
            if done_w:
                break
            state_b = next_state_w

        if done_w:
            reward = reward_w * student
        elif done_b:
            reward = reward_b * -student
        else:
            raise ValueError
        if reward > 0:
            wins += 1

    print('### games', num_test_games, 'wins', wins)
    win_queue.put(wins)
def play(
        teacher,
        teacher_agent_type='rainbow',
        student_agent_type='rainbow',
        opponent_agent_type='',
        board_size=8,
        num_rounds=400000,
        teacher_search_depth=1,
        student_search_depth=1,
        opponent_search_depth=1,
        rand_seed=0,
        env_init_rand_steps=0,
        test_init_rand_steps=10,
        num_disk_as_reward=True,
        render=False,
        train_teacher=False,
        test_interval=2500,
        num_test_games=200,
        teacher_train_steps=5000,
        save_interval=5000,
        # load_path='',
        # load_path='data/selfplay/rainbow_selfplay_350000.pth',
        # load_path='/data/unagi0/omura/othello/selfplay/rainbow_selfplay_350000.pth',
        # load_path='/data/unagi0/omura/othello/selfplay/rainbow_selfplay_2nd_65000.pth',
        load_path='/data/unagi0/omura/othello/teacher_student/rainbow_gre_rand_teacher_train_10interval_mp_59999.pth',
        num_process=1):
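    """Teacher-student training loop.

    A teacher and a student policy alternate colours at random.  Student
    transitions are learned from directly; the teacher's transitions are
    buffered in a queue (its terminal reward is the change in the student's
    win rates) and replayed once `teacher_train_steps` of them accumulate.
    The student is evaluated against random and greedy opponents in
    `num_process` parallel worker processes via `calc_win`.
    """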
    print('teacher: {}'.format(teacher_agent_type))
    print('student: {}'.format(student_agent_type))
    print('opponent: {}'.format(opponent_agent_type))

    agent_name_teacher = 'rainbow_gre_rand_teacher_notrain_mp_load_teacher60k'
    agent_name_student = 'rainbow_gre_rand_student_notrain_mp_load_teacher60k'
    # agent_name_teacher = 'test'
    # agent_name_student = 'test'
    # load_path = ''

    teacher_policy = create_policy(policy_type=teacher_agent_type,
                                   board_size=board_size,
                                   seed=rand_seed,
                                   search_depth=teacher_search_depth,
                                   agent_name=agent_name_teacher)
    student_policy = create_policy(policy_type=student_agent_type,
                                   board_size=board_size,
                                   seed=rand_seed,
                                   search_depth=student_search_depth,
                                   agent_name=agent_name_student)

    opponent_policy1 = create_policy(policy_type='rand',
                                     board_size=board_size,
                                     seed=rand_seed,
                                     search_depth=opponent_search_depth)
    opponent_policy2 = create_policy(policy_type='greedy',
                                     board_size=board_size,
                                     seed=rand_seed,
                                     search_depth=opponent_search_depth)
    opponent_policies = [('rand', opponent_policy1),
                         ('greedy', opponent_policy2)]
    # opponent_policies = [('greedy', opponent_policy1)]

    if not train_teacher:

        def noop(*args):
            pass

        teacher_policy.run = noop
    # if not hasattr(protagonist_policy, 'run'):
    #     protagonist_policy.run = nop

    # if opponent_agent_type == 'human':
    #     render_in_step = True
    # else:
    #     render_in_step = False

    if load_path:
        print('Load {} ...'.format(load_path))
        start_episode, loss = load(teacher_policy, load_path)
    else:
        start_episode = 0

    env = othello.SimpleOthelloEnv(board_size=board_size,
                                   seed=rand_seed,
                                   initial_rand_steps=env_init_rand_steps,
                                   num_disk_as_reward=num_disk_as_reward,
                                   render_in_step=render)
    #
    # env_test = othello.OthelloEnv(
    #     board_size=board_size,
    #     seed=rand_seed,
    #     initial_rand_steps=env_init_rand_steps,
    #     num_disk_as_reward=num_disk_as_reward,
    #     render_in_step=render)

    win_cnts = draw_cnts = lose_cnts = 0
    win_per = {'rand': 0, 'greedy': 0}
    last_win_per = {'rand': 0, 'greedy': 0}
    teacher_queue = queue.Queue()
    # for i in range(start_episode, num_rounds):
    for i in range(num_rounds):
        switch = np.random.randint(2)
        if switch:
            teacher = teacher * -1

        policy = {}
        if teacher == -1:
            tcolor = 'black'
            policy['black'] = teacher_policy
            policy['white'] = student_policy
        else:
            tcolor = 'white'
            policy['black'] = student_policy
            policy['white'] = teacher_policy

        print('Episode {}'.format(i + 1))
        print('Teacher is {}'.format(tcolor))

        def run(color, state, action, reward, done, next_state):
            if color == tcolor:
                if done:
                    teacher_reward = 0
                    for k in win_per.keys():
                        teacher_reward += win_per[k] - last_win_per[k]
                else:
                    teacher_reward = 0
                if student_policy.is_learning():
                    # print('### learning')
                    teacher_queue.put(
                        (state, action, teacher_reward, done, next_state))

            else:
                policy[color].run(state, action, reward, done, next_state)

        obs_b = env.reset()
        state_b = make_state(obs_b, env)
        teacher_policy.reset(env)
        student_policy.reset(env)
        if render:
            env.render()
        done_b = done_w = False
        init = True

        # student_policy.win_queue = mp.Queue(num_process)

        while not (done_b or done_w):
            # black
            assert env.player_turn == -1
            action_b = policy['black'].get_action(state_b)
            next_obs_b, reward_b, done_b, _ = env.step(action_b)
            next_state_b = make_state(next_obs_b, env)
            while (not done_b) and env.player_turn == -1:
                # policy['black'].run(state_b, action_b, reward_b, done_b, next_state_b)
                run('black', state_b, action_b, reward_b, done_b, next_state_b)
                action_b = policy['black'].get_action(next_state_b)
                next_obs_b, reward_b, done_b, _ = env.step(action_b)
                next_state_b = make_state(next_obs_b, env)

            # learning black policy
            if not init:
                # policy['white'].run(state_w, action_w, - reward_b, done_b, next_state_b)
                run('white', state_w, action_w, -reward_b, done_b,
                    next_state_b)
            init = False
            if done_b:
                # policy['black'].run(state_b, action_b, reward_b, done_b, next_state_b)
                run('black', state_b, action_b, reward_b, done_b, next_state_b)
                break

            # white
            assert env.player_turn == 1
            state_w = next_state_b
            action_w = policy['white'].get_action(state_w)
            next_obs_w, reward_w, done_w, _ = env.step(action_w)
            next_state_w = make_state(next_obs_w, env)
            while (not done_w) and env.player_turn == 1:
                # policy['white'].run(state_w, action_w, reward_w, done_w, next_state_w)
                run('white', state_w, action_w, reward_w, done_w, next_state_w)
                action_w = policy['white'].get_action(next_state_w)
                next_obs_w, reward_w, done_w, _ = env.step(action_w)
                next_state_w = make_state(next_obs_w, env)

            # learning black policy
            # policy['black'].run(state_b, action_b, - reward_w, done_w, next_state_w)
            run('black', state_b, action_b, -reward_w, done_w, next_state_w)
            if done_w:
                # policy['white'].run(state_w, action_w, reward_w, done_w, next_state_w)
                run('white', state_w, action_w, reward_w, done_w, next_state_w)
                break

            state_b = next_state_w

            if render:
                env.render()

        if done_w:
            reward = reward_w * teacher
        elif done_b:
            reward = reward_b * -teacher
        else:
            raise ValueError

        print('reward={}'.format(reward))
        if num_disk_as_reward:
            total_disks = board_size**2
            if teacher == 1:
                white_cnts = (total_disks + reward) / 2
                black_cnts = total_disks - white_cnts

                if white_cnts > black_cnts:
                    win_cnts += 1
                elif white_cnts == black_cnts:
                    draw_cnts += 1
                else:
                    lose_cnts += 1

            else:
                black_cnts = (total_disks + reward) / 2
                white_cnts = total_disks - black_cnts

                if black_cnts > white_cnts:
                    win_cnts += 1
                elif white_cnts == black_cnts:
                    draw_cnts += 1
                else:
                    lose_cnts += 1

        else:
            if reward == 1:
                win_cnts += 1
            elif reward == 0:
                draw_cnts += 1
            else:
                lose_cnts += 1
        print('-' * 3)
        print('#Wins: {}, #Draws: {}, #Loses: {}'.format(
            win_cnts, draw_cnts, lose_cnts))

        if teacher_queue.qsize() >= teacher_train_steps:
            while not teacher_queue.empty():
                trans = teacher_queue.get()
                teacher_policy.run(*trans)

        # calc student's winning %
        args_student = (student_agent_type, board_size, rand_seed,
                        student_search_depth, 'calc_win_rate')

        state_dict = student_policy.network_state_dict()
        if i % test_interval == 0:
            env.initial_rand_steps = test_init_rand_steps
            for name, opponent_policy in opponent_policies:
                win_queue = mp.Queue(num_process)
                p_games = num_test_games // num_process
                total_games = p_games * num_process
                ps = []
                student_policies = []
                # for j in range(num_process):
                #     student_policies.append(copy.deepcopy(student_policy))

                for j in range(num_process):
                    ps.append(
                        mp.Process(target=calc_win,
                                   args=(env, p_games, args_student,
                                         state_dict, opponent_policy,
                                         win_queue)))
                for p in ps:
                    p.start()
                    # time.sleep(0.5)
                for p in ps:
                    p.join()

                # assert win_queue.qsize() == num_process

                total_wins = 0
                for _ in range(num_process):
                    total_wins += win_queue.get()

                last_win_per[name] = win_per[name]
                win_per[name] = total_wins / total_games
                student_policy.writer.add_scalar("win%({})".format(name),
                                                 win_per[name], i)
            print()
            print('last win%:', last_win_per)
            print('win%:', win_per)
            print()
            env.initial_rand_steps = env_init_rand_steps

        if (i + 1) % save_interval == 0:
            teacher_path = '/data/unagi0/omura/othello/teacher_student/{}_{}.pth'.format(
                agent_name_teacher, i + 1)
            student_path = '/data/unagi0/omura/othello/teacher_student/{}_{}.pth'.format(
                agent_name_student, i + 1)
            # teacher_path = 'data/teacher_student/{}_{}.pth'.format(agent_name_teacher, i)
            # student_path = 'data/teacher_student/{}_{}.pth'.format(agent_name_student, i)
            save(i, teacher_policy, 0, teacher_path)
            save(i, student_policy, 0, student_path)

    env.close()