def TD0(env, policy=None, alpha=0.01, gamma=1.0):
    """
    Given a fixed, stochastic policy, estimate the value function associated
    with that policy. Returns an estimate of V_pi(s) for all s in S.
    """
    if policy is None:
        policy = create_policy(env)

    v_pi = [0] * env.observation_space.n
    for e in range(N_EPISODES):
        obs = env.reset()
        for t in range(MAX_TS_PER_EPISODE):
            # env.render()
            prev_state = obs
            action = np.random.choice(list(policy[obs].keys()),
                                      p=list(policy[obs].values()))
            obs, reward, done, info = env.step(action)
            v_pi[prev_state] += alpha * (reward + gamma * v_pi[obs] - v_pi[prev_state])
            if done:
                # print(f'Episode {e} finished after {t + 1} timesteps')
                break
        # env.render()
    env.close()
    return v_pi
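# Usage sketch (not part of the original module): TD0 assumes a small, discrete
# Gym-style environment, module-level N_EPISODES / MAX_TS_PER_EPISODE constants,
# and a tabular policy mapping state -> {action: probability} (which
# create_policy(env) is assumed to return here). Something like:
#
#     import gym
#     env = gym.make('FrozenLake-v1')          # any tabular env would do
#     v_pi = TD0(env, alpha=0.05, gamma=0.99)  # list of length env.observation_space.n
#     print(v_pi)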
def rule_base_game(name, num_games, env, pipe):
    env.initial_rand_steps = env.test_rand_steps_holder
    dummy_outputs = (0, 0, 0)
    policy = create_policy(policy_type=name,
                           board_size=env.board_size,
                           seed=env.rand_seed)

    def get_action(color, p_color, state):
        if color == p_color:
            # The protagonist's moves come from the process on the other end of
            # the pipe; all other moves are played by the rule-based policy.
            pipe.send((state, 0, 0, 0,
                       {'type': 'need_action', 'choices': env.possible_moves},
                       dummy_outputs))
            _, action, _ = pipe.recv()
        else:
            action = policy.get_test_action(state)
        return action

    num_wins = 0
    for _ in range(num_games):
        # reset
        obs_b = env.reset()
        policy.reset(env)
        state_b = make_state(obs_b, env)
        protagonist = np.random.randint(2)
        protagonist = -1 if protagonist == 0 else 1
        pcolor = 'black' if protagonist == -1 else 'white'
        done_b = done_w = False

        # game
        while not (done_b or done_w):
            # black
            assert env.player_turn == -1
            action_b = get_action(env.player_turn, protagonist, state_b)
            next_obs_b, reward_b, done_b, _ = env.step(action_b)
            next_state_b = make_state(next_obs_b, env)
            while (not done_b) and env.player_turn == -1:
                action_b = get_action(env.player_turn, protagonist, next_state_b)
                next_obs_b, reward_b, done_b, _ = env.step(action_b)
                next_state_b = make_state(next_obs_b, env)
            if done_b:
                break

            # white
            assert env.player_turn == 1
            state_w = next_state_b
            action_w = get_action(env.player_turn, protagonist, state_w)
            next_obs_w, reward_w, done_w, _ = env.step(action_w)
            next_state_w = make_state(next_obs_w, env)
            while (not done_w) and env.player_turn == 1:
                action_w = get_action(env.player_turn, protagonist, next_state_w)
                next_obs_w, reward_w, done_w, _ = env.step(action_w)
                next_state_w = make_state(next_obs_w, env)
            if done_w:
                break

            state_b = next_state_w

        if done_w:
            reward = reward_w * protagonist
        elif done_b:
            reward = reward_b * -protagonist
        else:
            raise ValueError
        if reward > 0:
            num_wins += 1

    env.initial_rand_steps = env.rand_steps_holder
    return num_wins
def play(protagonist,
         protagonist_agent_type='greedy',
         opponent_agent_type='rand',
         board_size=8,
         num_rounds=300000,
         protagonist_search_depth=1,
         opponent_search_depth=1,
         rand_seed=0,
         env_init_rand_steps=0,
         test_init_rand_steps=10,
         num_disk_as_reward=True,
         render=False,
         test_interval=2500,
         num_test_games=200,
         save_interval=5000,
         # load_path='data/selfplay/rainbow_selfplay_350000.pth'):
         load_path=''):
    print('protagonist: {}'.format(protagonist_agent_type))
    print('opponent: {}'.format(opponent_agent_type))

    agent_name = 'rainbow_selfplay_2nd'
    protagonist_policy = create_policy(policy_type=protagonist_agent_type,
                                       board_size=board_size,
                                       seed=rand_seed,
                                       search_depth=protagonist_search_depth,
                                       agent_name=agent_name)
    opponent_policy1 = create_policy(policy_type='rand',
                                     board_size=board_size,
                                     seed=rand_seed,
                                     search_depth=opponent_search_depth)
    opponent_policy2 = create_policy(policy_type='greedy',
                                     board_size=board_size,
                                     seed=rand_seed,
                                     search_depth=opponent_search_depth)
    opponent_policies = [('rand', opponent_policy1), ('greedy', opponent_policy2)]

    if load_path:
        print('Load {} ...'.format(load_path))
        start_episode, loss = load(protagonist_policy, load_path)
    else:
        start_episode = 0

    env = othello.SimpleOthelloEnv(board_size=board_size,
                                   seed=rand_seed,
                                   initial_rand_steps=env_init_rand_steps,
                                   num_disk_as_reward=num_disk_as_reward,
                                   render_in_step=render)

    win_cnts = draw_cnts = lose_cnts = 0
    for i in range(start_episode, num_rounds):
        # Re-randomize the protagonist's color every episode.
        switch = np.random.randint(2)
        if switch:
            protagonist = protagonist * -1

        # Self-play: the same network controls both colors; pcolor marks which
        # side's transitions are used for learning this episode.
        pcolor = 'black' if protagonist == -1 else 'white'
        policy = {}
        policy['black'] = protagonist_policy
        policy['white'] = protagonist_policy
        print('Episode {}'.format(i + 1))
        print('Protagonist is {}'.format(pcolor))

        obs_b = env.reset()
        state_b = make_state(obs_b, env)
        protagonist_policy.reset(env)
        if render:
            env.render()

        done_b = done_w = False
        init = True
        while not (done_b or done_w):
            # black
            assert env.player_turn == -1
            action_b = action('black', pcolor, state_b, policy)
            next_obs_b, reward_b, done_b, _ = env.step(action_b)
            next_state_b = make_state(next_obs_b, env)
            while (not done_b) and env.player_turn == -1:
                if pcolor == 'black':
                    policy['black'].run(state_b, action_b, reward_b, done_b, next_state_b)
                action_b = action('black', pcolor, next_state_b, policy)
                next_obs_b, reward_b, done_b, _ = env.step(action_b)
                next_state_b = make_state(next_obs_b, env)
            # learn the white policy from the black transition
            if not init:
                if pcolor == 'white':
                    policy['white'].run(state_w, action_w, -reward_b, done_b, next_state_b)
            init = False
            if done_b:
                if pcolor == 'black':
                    policy['black'].run(state_b, action_b, reward_b, done_b, next_state_b)
                break

            # white
            assert env.player_turn == 1
            state_w = next_state_b
            action_w = action('white', pcolor, state_w, policy)
            next_obs_w, reward_w, done_w, _ = env.step(action_w)
            next_state_w = make_state(next_obs_w, env)
            while (not done_w) and env.player_turn == 1:
                if pcolor == 'white':
                    policy['white'].run(state_w, action_w, reward_w, done_w, next_state_w)
                action_w = action('white', pcolor, next_state_w, policy)
                next_obs_w, reward_w, done_w, _ = env.step(action_w)
                next_state_w = make_state(next_obs_w, env)
            # learn the black policy from the white transition
            if pcolor == 'black':
                policy['black'].run(state_b, action_b, -reward_w, done_w, next_state_w)
            if done_w:
                if pcolor == 'white':
                    policy['white'].run(state_w, action_w, reward_w, done_w, next_state_w)
                break

            state_b = next_state_w
            if render:
                env.render()

        if done_w:
            reward = reward_w * protagonist
        elif done_b:
            reward = reward_b * -protagonist
        else:
            raise ValueError
        print('reward={}'.format(reward))

        if num_disk_as_reward:
            total_disks = board_size ** 2
            if protagonist == 1:
                white_cnts = (total_disks + reward) / 2
                black_cnts = total_disks - white_cnts
                if white_cnts > black_cnts:
                    win_cnts += 1
                elif white_cnts == black_cnts:
                    draw_cnts += 1
                else:
                    lose_cnts += 1
            else:
                black_cnts = (total_disks + reward) / 2
                white_cnts = total_disks - black_cnts
                if black_cnts > white_cnts:
                    win_cnts += 1
                elif white_cnts == black_cnts:
                    draw_cnts += 1
                else:
                    lose_cnts += 1
        else:
            if reward == 1:
                win_cnts += 1
            elif reward == 0:
                draw_cnts += 1
            else:
                lose_cnts += 1
        print('-' * 3)
        print('#Wins: {}, #Draws: {}, #Loses: {}'.format(
            win_cnts, draw_cnts, lose_cnts))

        # evaluate the protagonist's win rate against the rule-based opponents
        if i % test_interval == 0:
            env.initial_rand_steps = test_init_rand_steps
            for name, opponent_policy in opponent_policies:
                wins = 0
                protagonist = -1
                for j in range(num_test_games):
                    switch = np.random.randint(2)
                    if switch:
                        protagonist = protagonist * -1
                    policy = {}
                    if protagonist == -1:
                        pcolor = 'BLACK'
                        policy['black'] = protagonist_policy
                        policy['white'] = opponent_policy
                    else:
                        pcolor = 'WHITE'
                        policy['black'] = opponent_policy
                        policy['white'] = protagonist_policy

                    obs_b = env.reset()
                    state_b = make_state(obs_b, env)
                    protagonist_policy.reset(env)
                    opponent_policy.reset(env)
                    if render:
                        env.render()

                    done_b = done_w = False
                    while not (done_b or done_w):
                        # black
                        assert env.player_turn == -1
                        action_b = policy['black'].get_test_action(state_b)
                        next_obs_b, reward_b, done_b, _ = env.step(action_b)
                        next_state_b = make_state(next_obs_b, env)
                        while (not done_b) and env.player_turn == -1:
                            action_b = policy['black'].get_test_action(next_state_b)
                            next_obs_b, reward_b, done_b, _ = env.step(action_b)
                            next_state_b = make_state(next_obs_b, env)
                        if done_b:
                            break

                        # white
                        assert env.player_turn == 1
                        state_w = next_state_b
                        action_w = policy['white'].get_test_action(state_w)
                        next_obs_w, reward_w, done_w, _ = env.step(action_w)
                        next_state_w = make_state(next_obs_w, env)
                        while (not done_w) and env.player_turn == 1:
                            action_w = policy['white'].get_test_action(next_state_w)
                            next_obs_w, reward_w, done_w, _ = env.step(action_w)
                            next_state_w = make_state(next_obs_w, env)
                        if done_w:
                            break

                        state_b = next_state_w

                    if done_w:
                        reward = reward_w * protagonist
                    elif done_b:
                        reward = reward_b * -protagonist
                    else:
                        raise ValueError
                    if reward > 0:
                        wins += 1

                win_per = wins / num_test_games
                print()
                print('win % ({}):'.format(name), win_per)
                print()
                protagonist_policy.writer.add_scalar("win%({})".format(name),
                                                     win_per, i)
            env.initial_rand_steps = env_init_rand_steps

        if i % save_interval == 0:
            if os.path.exists('/data/unagi0/omura'):
                save_path = '/data/unagi0/omura/othello/selfplay/{}_{}.pth'.format(
                    agent_name, i)
            else:
                save_path = 'data/selfplay/{}_{}.pth'.format(agent_name, i)
            save(i, protagonist_policy, 0, save_path)

    env.close()
def calc_win(env, num_test_games, args_policy, state_dict, opponent_policy,
             win_queue):
    env = copy.deepcopy(env)
    student_policy = create_policy(*args_policy)
    student_policy.load_state_dict(state_dict)

    wins = 0
    for j in range(num_test_games):
        student = np.random.randint(2)
        student = -1 if student == 0 else 1
        policy = {}
        if student == -1:
            policy['black'] = student_policy
            policy['white'] = opponent_policy
        else:
            policy['black'] = opponent_policy
            policy['white'] = student_policy

        obs_b = env.reset()
        state_b = make_state(obs_b, env)
        student_policy.reset(env)
        opponent_policy.reset(env)

        done_b = done_w = False
        while not (done_b or done_w):
            # black
            assert env.player_turn == -1
            action_b = policy['black'].get_test_action(state_b)
            next_obs_b, reward_b, done_b, _ = env.step(action_b)
            next_state_b = make_state(next_obs_b, env)
            while (not done_b) and env.player_turn == -1:
                action_b = policy['black'].get_test_action(next_state_b)
                next_obs_b, reward_b, done_b, _ = env.step(action_b)
                next_state_b = make_state(next_obs_b, env)
            if done_b:
                break

            # white
            assert env.player_turn == 1
            state_w = next_state_b
            action_w = policy['white'].get_test_action(state_w)
            next_obs_w, reward_w, done_w, _ = env.step(action_w)
            next_state_w = make_state(next_obs_w, env)
            while (not done_w) and env.player_turn == 1:
                action_w = policy['white'].get_test_action(next_state_w)
                next_obs_w, reward_w, done_w, _ = env.step(action_w)
                next_state_w = make_state(next_obs_w, env)
            if done_w:
                break

            state_b = next_state_w

        if done_w:
            reward = reward_w * student
        elif done_b:
            reward = reward_b * -student
        else:
            raise ValueError
        if reward > 0:
            wins += 1

    print('### games', num_test_games, 'wins', wins)
    win_queue.put(wins)
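# Usage sketch (assumption, mirroring how the training loop below spawns its
# evaluation workers): calc_win is meant to run in a separate process, given a
# picklable network state_dict and a multiprocessing queue for the result.
#
#     import multiprocessing as mp
#     win_queue = mp.Queue()
#     args_student = (student_agent_type, board_size, rand_seed,
#                     student_search_depth, 'calc_win_rate')
#     p = mp.Process(target=calc_win,
#                    args=(env, 50, args_student,
#                          student_policy.network_state_dict(),
#                          opponent_policy, win_queue))
#     p.start()
#     p.join()
#     wins = win_queue.get()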
def play(teacher,
         teacher_agent_type='rainbow',
         student_agent_type='rainbow',
         opponent_agent_type='',
         board_size=8,
         num_rounds=400000,
         teacher_search_depth=1,
         student_search_depth=1,
         opponent_search_depth=1,
         rand_seed=0,
         env_init_rand_steps=0,
         test_init_rand_steps=10,
         num_disk_as_reward=True,
         render=False,
         train_teacher=False,
         test_interval=2500,
         num_test_games=200,
         teacher_train_steps=5000,
         save_interval=5000,
         # load_path='',
         # load_path='data/selfplay/rainbow_selfplay_350000.pth',
         # load_path='/data/unagi0/omura/othello/selfplay/rainbow_selfplay_350000.pth',
         # load_path='/data/unagi0/omura/othello/selfplay/rainbow_selfplay_2nd_65000.pth',
         load_path='/data/unagi0/omura/othello/teacher_student/rainbow_gre_rand_teacher_train_10interval_mp_59999.pth',
         num_process=1):
    print('teacher: {}'.format(teacher_agent_type))
    print('student: {}'.format(student_agent_type))
    print('opponent: {}'.format(opponent_agent_type))

    agent_name_teacher = 'rainbow_gre_rand_teacher_notrain_mp_load_teacher60k'
    agent_name_student = 'rainbow_gre_rand_student_notrain_mp_load_teacher60k'

    teacher_policy = create_policy(policy_type=teacher_agent_type,
                                   board_size=board_size,
                                   seed=rand_seed,
                                   search_depth=teacher_search_depth,
                                   agent_name=agent_name_teacher)
    student_policy = create_policy(policy_type=student_agent_type,
                                   board_size=board_size,
                                   seed=rand_seed,
                                   search_depth=student_search_depth,
                                   agent_name=agent_name_student)
    opponent_policy1 = create_policy(policy_type='rand',
                                     board_size=board_size,
                                     seed=rand_seed,
                                     search_depth=opponent_search_depth)
    opponent_policy2 = create_policy(policy_type='greedy',
                                     board_size=board_size,
                                     seed=rand_seed,
                                     search_depth=opponent_search_depth)
    opponent_policies = [('rand', opponent_policy1), ('greedy', opponent_policy2)]

    if not train_teacher:
        # Freeze the teacher: replace its learning step with a no-op.
        def noop(*args):
            pass
        teacher_policy.run = noop

    if load_path:
        print('Load {} ...'.format(load_path))
        start_episode, loss = load(teacher_policy, load_path)
    else:
        start_episode = 0

    env = othello.SimpleOthelloEnv(board_size=board_size,
                                   seed=rand_seed,
                                   initial_rand_steps=env_init_rand_steps,
                                   num_disk_as_reward=num_disk_as_reward,
                                   render_in_step=render)

    win_cnts = draw_cnts = lose_cnts = 0
    win_per = {'rand': 0, 'greedy': 0}
    last_win_per = {'rand': 0, 'greedy': 0}
    teacher_queue = queue.Queue()

    # for i in range(start_episode, num_rounds):
    for i in range(num_rounds):
        switch = np.random.randint(2)
        if switch:
            teacher = teacher * -1

        policy = {}
        if teacher == -1:
            tcolor = 'black'
            policy['black'] = teacher_policy
            policy['white'] = student_policy
        else:
            tcolor = 'white'
            policy['black'] = student_policy
            policy['white'] = teacher_policy
        print('Episode {}'.format(i + 1))
        print('Teacher is {}'.format(tcolor))

        def run(color, state, action, reward, done, next_state):
            # Teacher transitions are buffered with a reward equal to the
            # change in the student's win rate; student transitions are
            # learned from immediately.
            if color == tcolor:
                if done:
                    teacher_reward = 0
                    for k in win_per.keys():
                        teacher_reward += win_per[k] - last_win_per[k]
                else:
                    teacher_reward = 0
                if student_policy.is_learning():
                    teacher_queue.put(
                        (state, action, teacher_reward, done, next_state))
            else:
                policy[color].run(state, action, reward, done, next_state)
        obs_b = env.reset()
        state_b = make_state(obs_b, env)
        teacher_policy.reset(env)
        student_policy.reset(env)
        if render:
            env.render()

        done_b = done_w = False
        init = True
        while not (done_b or done_w):
            # black
            assert env.player_turn == -1
            action_b = policy['black'].get_action(state_b)
            next_obs_b, reward_b, done_b, _ = env.step(action_b)
            next_state_b = make_state(next_obs_b, env)
            while (not done_b) and env.player_turn == -1:
                run('black', state_b, action_b, reward_b, done_b, next_state_b)
                action_b = policy['black'].get_action(next_state_b)
                next_obs_b, reward_b, done_b, _ = env.step(action_b)
                next_state_b = make_state(next_obs_b, env)
            # learn the white policy from the black transition
            if not init:
                run('white', state_w, action_w, -reward_b, done_b, next_state_b)
            init = False
            if done_b:
                run('black', state_b, action_b, reward_b, done_b, next_state_b)
                break

            # white
            assert env.player_turn == 1
            state_w = next_state_b
            action_w = policy['white'].get_action(state_w)
            next_obs_w, reward_w, done_w, _ = env.step(action_w)
            next_state_w = make_state(next_obs_w, env)
            while (not done_w) and env.player_turn == 1:
                run('white', state_w, action_w, reward_w, done_w, next_state_w)
                action_w = policy['white'].get_action(next_state_w)
                next_obs_w, reward_w, done_w, _ = env.step(action_w)
                next_state_w = make_state(next_obs_w, env)
            # learn the black policy from the white transition
            run('black', state_b, action_b, -reward_w, done_w, next_state_w)
            if done_w:
                run('white', state_w, action_w, reward_w, done_w, next_state_w)
                break

            state_b = next_state_w
            if render:
                env.render()

        if done_w:
            reward = reward_w * teacher
        elif done_b:
            reward = reward_b * -teacher
        else:
            raise ValueError
        print('reward={}'.format(reward))

        if num_disk_as_reward:
            total_disks = board_size ** 2
            if teacher == 1:
                white_cnts = (total_disks + reward) / 2
                black_cnts = total_disks - white_cnts
                if white_cnts > black_cnts:
                    win_cnts += 1
                elif white_cnts == black_cnts:
                    draw_cnts += 1
                else:
                    lose_cnts += 1
            else:
                black_cnts = (total_disks + reward) / 2
                white_cnts = total_disks - black_cnts
                if black_cnts > white_cnts:
                    win_cnts += 1
                elif white_cnts == black_cnts:
                    draw_cnts += 1
                else:
                    lose_cnts += 1
        else:
            if reward == 1:
                win_cnts += 1
            elif reward == 0:
                draw_cnts += 1
            else:
                lose_cnts += 1
        print('-' * 3)
        print('#Wins: {}, #Draws: {}, #Loses: {}'.format(
            win_cnts, draw_cnts, lose_cnts))

        # Train the teacher once enough transitions have accumulated.
        if teacher_queue.qsize() >= teacher_train_steps:
            while not teacher_queue.empty():
                trans = teacher_queue.get()
                teacher_policy.run(*trans)

        # calc student's winning %
        args_student = (student_agent_type, board_size, rand_seed,
                        student_search_depth, 'calc_win_rate')
        state_dict = student_policy.network_state_dict()
        if i % test_interval == 0:
            env.initial_rand_steps = test_init_rand_steps
            for name, opponent_policy in opponent_policies:
                win_queue = mp.Queue(num_process)
                p_games = num_test_games // num_process
                total_games = p_games * num_process
                ps = []
                for j in range(num_process):
                    ps.append(
                        mp.Process(target=calc_win,
                                   args=(env, p_games, args_student, state_dict,
                                         opponent_policy, win_queue)))
                for p in ps:
                    p.start()
                for p in ps:
                    p.join()

                total_wins = 0
                for _ in range(num_process):
                    total_wins += win_queue.get()

                last_win_per[name] = win_per[name]
                win_per[name] = total_wins / total_games
                student_policy.writer.add_scalar("win%({})".format(name),
                                                 win_per[name], i)
            print()
            print('last win%:', last_win_per)
            print('win%:', win_per)
            print()
            env.initial_rand_steps = env_init_rand_steps

        if (i + 1) % save_interval == 0:
            teacher_path = '/data/unagi0/omura/othello/teacher_student/{}_{}.pth'.format(
                agent_name_teacher, i + 1)
            student_path = '/data/unagi0/omura/othello/teacher_student/{}_{}.pth'.format(
                agent_name_student, i + 1)
            save(i, teacher_policy, 0, teacher_path)
            save(i, student_policy, 0, student_path)

    env.close()
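# Entry-point sketch (assumption; the original launcher is not shown in this
# section). `teacher` is the teacher's starting color (-1 for black, +1 for
# white) and is re-randomized every episode, so either value works.
#
#     if __name__ == '__main__':
#         play(teacher=-1, num_rounds=10000, num_process=4, load_path='')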