# Self-play training script: two `policy_agent(1000, 0.01)` instances (RA1, RA2)
# play against each other on a fresh `Board()` for 1,000,000 iterations.
#
# NOTE(review): this whole script has been collapsed onto a single physical
# line, so the original line breaks and indentation are lost. The statement
# nesting described below is read off the statement order only -- TODO confirm
# against the original, properly formatted file before restructuring.
#
# Visible flow, in statement order:
#   * Setup: create RA1, RA2 and a `random_agent()` RA3, then zero the counters
#     (policy2_wins, policy1_wins, draws, doomed, win_ratio, random_wins) and
#     set `exit = 1`.
#     NOTE(review): the name `exit` shadows the Python builtin -- consider
#     renaming once the rest of the file is visible.
#   * Per iteration: build a new Board; `start` is True when
#     np.random.uniform() > 0.5; `first` starts out False.
#   * Game loop: runs while board.check_win() == 0 and board.board still
#     contains a 0 entry (presumably an empty cell -- verify against Board).
#       - If `first or start`, RA1 moves: play_tac(*RA1.get_move(features)) is
#         re-invoked, asking the agent for a move each time, until it returns
#         something other than False (presumably False means the move was
#         rejected -- confirm). There is no visible retry bound.
#       - `if not np.any(board.board == 0): break` stops when no 0 entry is
#         left on the board.
#       - RA2 moves the same way through play_tic, then `first = True`.
#   * Update: RA2.update_params(check_win() * -200 * board.tic) and
#     RA1.update_params(check_win() * -200 * board.tac); the `- 0` / `+ 0`
#     terms do not change the value. Presumably a game-outcome signal applied
#     after the game loop -- TODO confirm nesting and sign convention.
#   * The trailing `if board.check_win() == board.tic:` has no visible body;
#     it continues beyond this fragment.
#
# NOTE(review): `episodes` is assigned twice but never read in the visible
# text; RA3, wrong_move and the win/draw counters are likewise not used in the
# visible portion (they may be used further down the file).
RA1 = policy_agent(1000, 0.01) RA2 = policy_agent(1000, 0.01) RA3 = random_agent() policy2_wins = 0 policy1_wins = 0 draws = 0 doomed = 0 exit = 1 win_ratio = 0 random_wins = 0 for i in range(1000000): wrong_move = 0 board = Board() start = np.random.uniform() > 0.5 first = False while (board.check_win() == 0 and np.any(board.board == 0)): episodes = 0 if first or start: while board.play_tac(*(RA1.get_move( board.get_feature_vec(board.tac)))) is False: pass if not np.any(board.board == 0): break episodes = 0 while board.play_tic(*( RA2.get_move(board.get_feature_vec(board.tic)))) is False: pass first = True RA2.update_params((board.check_win() - 0) * -200 * board.tic) RA1.update_params((board.check_win() + 0) * -200 * board.tac) if board.check_win() == board.tic:
# Second copy of the self-play training script: statement-for-statement the
# same as the line before it, differing only in spacing (`while(` here) and in
# where the long play_tic(...) call was wrapped.
#
# NOTE(review): the script is collapsed onto one physical line, so indentation
# and therefore statement nesting cannot be read from this text -- TODO confirm
# the structure against a properly formatted copy before editing.
#
# Visible statement order:
#   * Create two `policy_agent(1000, 0.01)` instances (RA1, RA2) and a
#     `random_agent()` (RA3); initialise the counters and `exit = 1`.
#     NOTE(review): `exit` shadows the Python builtin of the same name.
#   * For each of 1,000,000 iterations: new Board(), `start` drawn as
#     np.random.uniform() > 0.5, `first = False`.
#   * While board.check_win() == 0 and board.board contains a 0 entry:
#       - when `first or start`, call play_tac with RA1's move repeatedly
#         until the result is not False (presumably a rejected move returns
#         False -- confirm; no retry limit is visible);
#       - break if board.board no longer contains a 0 entry;
#       - call play_tic with RA2's move the same way, then set `first = True`.
#   * Call RA2.update_params(check_win() * -200 * board.tic) and
#     RA1.update_params(check_win() * -200 * board.tac). The `- 0` and `+ 0`
#     are arithmetic no-ops. Whether these calls run once per game or on every
#     turn depends on the lost indentation -- TODO confirm.
#   * The final `if board.check_win() == board.tic:` is cut off; its body is
#     not part of this fragment.
#
# NOTE(review): `episodes`, `wrong_move`, RA3 and the win/draw counters are
# written but never read in the visible text.
RA1 = policy_agent(1000, 0.01) RA2 = policy_agent(1000, 0.01) RA3 = random_agent() policy2_wins = 0 policy1_wins = 0 draws = 0 doomed = 0 exit = 1 win_ratio = 0 random_wins = 0 for i in range(1000000): wrong_move = 0 board = Board() start = np.random.uniform() > 0.5 first = False while(board.check_win() == 0 and np.any(board.board == 0)): episodes = 0 if first or start: while board.play_tac(*(RA1.get_move( board.get_feature_vec(board.tac)))) is False: pass if not np.any(board.board == 0): break episodes = 0 while board.play_tic(*(RA2.get_move( board.get_feature_vec(board.tic)))) is False: pass first = True RA2.update_params((board.check_win() - 0) * -200 * board.tic) RA1.update_params((board.check_win() + 0) * -200 * board.tac) if board.check_win() == board.tic: