def train_ep_p2(net, policy, render=False):
    """Train `net` for one episode while it plays second (player 1's opening
    turn is skipped so the opponent effectively opens the game).

    Args:
        net: learning agent; must provide `store_transition(s, a, r, s')` and
            `learn()`, and is consulted through `choose_action`.
        policy: opponent policy identifier forwarded to `choose_action` for
            player 2's moves.
        render: if True, print each move and render the board after it.

    Returns:
        (ep_reward, win): total player-1 reward for the episode, and a 0/1
        win flag set when ep_reward > 49.  NOTE(review): the 49 threshold
        presumably encodes the env's win bonus — confirm against the env's
        reward scheme.
    """
    state = env.reset()
    ep_reward = 0
    ctr = 0
    if render:
        env.render()
    while True:
        if ctr > 0:
            action = choose_action('self', 1, state, net, 0.05)
            next_state, reward, done, info = env.step(action)
            if render:
                print('Player 1 moves', action)
                env.render()
        else:
            # Skip player 1's first turn so that he goes second: fabricate a
            # no-op step that hands the turn to player 2.
            next_state = state
            reward = 0
            done = False
            info = {'next_player': 2}

        # Let player 2 (the opponent) play until the turn passes back.
        p2_reward = 0
        while info['next_player'] == 2 and not done:
            action2 = choose_action(policy, 2, next_state, net, 0.05)
            next_state, reward2, done, info = env.step(action2)
            if render:
                print('Player 2 moves', action2)
                env.render()
            p2_reward += reward2

        if ctr == 0:
            # First pass: player 1 made no move, so there is no transition to
            # store.  BUGFIX: advance `state` past player 2's opening move(s)
            # — the original `continue` left player 1 acting on the reset
            # state — and stop if the opponent already ended the episode.
            if done:
                break
            state = next_state
            ctr += 1
            continue

        # Player 1's effective reward is his own minus what the opponent
        # collected during the intervening player-2 moves.
        net.store_transition(state, action, reward - p2_reward, next_state)
        ep_reward += reward
        net.learn()
        if done:
            break
        ctr += 1
        state = next_state

    win = 1 if ep_reward > 49 else 0
    return ep_reward, win
def test_ep_pvp(net, net2, num_test, eps=0.05, render=False):
    """Play `num_test` evaluation episodes of `net` (player 1) against
    `net2` (player 2), both acting epsilon-greedily.

    Args:
        net: network choosing player 1's moves via `choose_action`.
        net2: network choosing player 2's moves via `choose_action`.
        num_test: number of episodes to play.
        eps: exploration rate passed to `choose_action` for both players.
        render: if True, print each move and render the board after it.

    Returns:
        (mean player-1 reward, mean player-2 reward, player-1 win rate,
        draw rate).  A win is ep_reward > 49, a draw ep_reward == 49 —
        NOTE(review): thresholds assumed from the env's scoring; confirm.
    """
    test_reward = []
    test_reward_p2 = []
    test_win = 0
    draw = 0
    # Removed dead code: the original kept a `ctr` step counter (and loop
    # index `i`) that were never read.
    for _ in range(num_test):
        state = env.reset()
        ep_reward = 0
        p2_reward = 0
        if render:
            env.render()
        while True:
            # Player 1 moves.
            action = choose_action('self', 1, state, net, eps)
            next_state, reward, done, info = env.step(action)
            if render:
                print('Player 1 moves', action)
                env.render()
            # Player 2 may move one or more times before the turn returns.
            while info['next_player'] == 2 and not done:
                action2 = choose_action('self', 2, next_state, net2, eps)
                next_state, reward2, done, info = env.step(action2)
                if render:
                    print('Player 2 moves', action2)
                    env.render()
                p2_reward += reward2
            ep_reward += reward
            if done:
                break
            state = next_state
        test_reward_p2.append(p2_reward)
        test_reward.append(ep_reward)
        if ep_reward > 49:
            test_win += 1
        if ep_reward == 49:
            draw += 1
    return (np.mean(test_reward), np.mean(test_reward_p2),
            test_win / num_test, draw / num_test)
def test_ep_p2(net, policy, num_test, eps=0.05, render=False):
    """Evaluate `net` over `num_test` episodes in which it plays second:
    player 1's opening turn is skipped so `policy` (player 2) opens.

    Args:
        net: network choosing player 1's moves via `choose_action`.
        policy: opponent policy identifier used for player 2's moves.
        num_test: number of episodes to play.
        eps: exploration rate passed to `choose_action`.
        render: if True, print each move and render the board after it.

    Returns:
        (mean episode reward, win rate) where a win is counted when the
        episode reward exceeds 49.
    """
    rewards = []
    wins = 0
    for _ in range(num_test):
        state = env.reset()
        episode_total = 0
        first_turn = True
        if render:
            env.render()
        while True:
            if first_turn:
                # Player 1 sits out the opening turn: fabricate a no-op step
                # that hands the move to player 2.
                next_state = state
                reward = 0
                done = False
                info = {'next_player': 2}
            else:
                action = choose_action('self', 1, state, net, eps)
                next_state, reward, done, info = env.step(action)
                if render:
                    print('Player 1 moves', action)
                    env.render()
            # Player 2 keeps moving until the turn passes back or the game ends.
            opponent_total = 0
            while info['next_player'] == 2 and not done:
                action2 = choose_action(policy, 2, next_state, net, eps)
                next_state, reward2, done, info = env.step(action2)
                if render:
                    print('Player 2 moves', action2)
                    env.render()
                opponent_total += reward2
            episode_total += reward
            if done:
                break
            state = next_state
            first_turn = False
        rewards.append(episode_total)
        if episode_total > 49:
            wins += 1
    return np.mean(rewards), wins / num_test
def train_ep(net, policy, render=False):
    """Train `net` for one episode playing first against `policy` (player 2).

    Args:
        net: learning agent; must provide `store_transition(s, a, r, s')` and
            `learn()`, and is consulted through `choose_action`.
        policy: opponent policy identifier forwarded to `choose_action` for
            player 2's moves.
        render: if True, print each move and render the board after it.

    Returns:
        (ep_reward, win): total player-1 reward for the episode, and a 0/1
        win flag set when ep_reward > 49.  NOTE(review): the 49 threshold
        presumably encodes the env's win bonus — confirm against the env's
        reward scheme.
    """
    state = env.reset()
    ep_reward = 0
    # Removed dead code: the original incremented/reset a `ctr` counter that
    # was never read.
    if render:
        env.render()
    while True:
        action = choose_action('self', 1, state, net, 0.05)
        next_state, reward, done, info = env.step(action)
        if render:
            print('Player 1 moves', action)
            env.render()
        # Let player 2 play until the turn passes back (or the game ends).
        p2_reward = 0
        while info['next_player'] == 2 and not done:
            action2 = choose_action(policy, 2, next_state, net, 0.05)
            next_state, reward2, done, info = env.step(action2)
            if render:
                print('Player 2 moves', action2)
                env.render()
            p2_reward += reward2
        # Player 1's effective reward is his own minus what the opponent
        # collected during the intervening player-2 moves.
        net.store_transition(state, action, reward - p2_reward, next_state)
        ep_reward += reward
        net.learn()
        if done:
            break
        state = next_state
    win = 1 if ep_reward > 49 else 0
    return ep_reward, win