import torch

def deep_q_learning_step(epsilon, player):
    """Play one agent move, let the opponent answer, and apply a TD update."""
    global loss_for_one_episode
    # Choose an action for the current player and read its Q-value from the
    # network head belonging to that player ((player + 2) % 3 maps 1 -> 0, -1 -> 1).
    index = epsilon_greedy(epsilon, player)
    q_value = model(torch.FloatTensor(game.board))[(player + 2) % 3][index]
    a_p, reward = game.step(index, player)
    if abs(a_p) == 10 or game.full_board():
        # Terminal right after our move: the target is the immediate reward.
        loss = (reward - q_value) ** 2
    else:
        # Let the opponent move until it is our turn again or the game ends.
        while a_p != player and abs(a_p) != 10 and not game.full_board():
            index = epsilon_greedy(epsilon, a_p)  # was `agr`, an undefined name; epsilon is the likely intent
            a_p, _ = game.step(index, a_p)
        if abs(a_p) == 10:
            # The opponent won: large penalty.
            loss = (reward - 17 - q_value) ** 2
        elif game.full_board():
            # Draw after the opponent's reply: small penalty.
            loss = (reward - 5 - q_value) ** 2
        else:
            # Bootstrap from the best Q-value of the resulting state; detach so
            # the bootstrapped target is treated as a constant (semi-gradient update).
            q_value_max = model(torch.FloatTensor(game.board) * player)[(a_p + 2) % 3].max().detach()
            loss = (reward + GAMMA * q_value_max - q_value) ** 2
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    # Accumulate as a plain float so the computation graph can be freed.
    loss_for_one_episode = loss_for_one_episode + loss.item()
    return a_p
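import random

# NOTE: `epsilon_greedy` is defined elsewhere in this project. The sketch below
# is an assumption about its behavior, kept under a different name so it does
# not shadow the real helper: with probability epsilon pick a random free cell,
# otherwise pick the free cell with the highest Q-value for `player`, using the
# same (player + 2) % 3 head indexing as deep_q_learning_step and assuming
# empty cells are stored as 0 in game.board.
def epsilon_greedy_sketch(epsilon, player):
    free = [i for i, cell in enumerate(game.board) if cell == 0]
    if random.random() < epsilon:
        return random.choice(free)
    q_values = model(torch.FloatTensor(game.board))[(player + 2) % 3]
    return max(free, key=lambda i: q_values[i].item())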
def play_with():
    """Play against the trained network; the network (player 1) moves first."""
    game.new_game()
    player = 1
    print(game.board)
    while abs(player) != 10 and not game.full_board():
        # Network move: purely greedy (epsilon = 0).
        index = epsilon_greedy(0.0, player)
        player, _ = game.step(index, player)
        print(game.board)
        if not (abs(player) != 10 and not game.full_board()):
            break  # the game ended on the network's move
        # Human move: the prompt is 1-based, the board index is 0-based.
        my_index = -1 + int(input("index: "))
        player, _ = game.step(my_index, player)
        print(game.board)
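# Hypothetical input guard (an assumption, not part of the original code):
# play_with() passes the raw input straight to game.step, so a typo or an
# occupied cell goes unchecked. A helper like this could re-prompt until the
# 1-based index maps to a free cell (again assuming empty cells are 0).
def read_human_index():
    while True:
        i = -1 + int(input("index: "))
        if 0 <= i < len(game.board) and game.board[i] == 0:
            return i
        print("cell taken or out of range, try again")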
def one_episode(epsilon, player):
    """Run one training episode; `player` selects which side opens the game."""
    game.new_game()
    global loss_for_one_episode, loss_for_sever_episodes
    loss_for_one_episode = 0
    if player == 1:
        # The learner opens the game.
        while abs(player) != 10 and not game.full_board():
            player = deep_q_learning_step(epsilon, player)
    else:
        # The opponent opens with a greedy move, then the learner takes over.
        index = epsilon_greedy(0.0, 1)
        player, _ = game.step(index, player)
        while abs(player) != 10 and not game.full_board():
            player = deep_q_learning_step(epsilon, player)
    print(loss_for_one_episode)
    loss_for_sever_episodes += loss_for_one_episode
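# Hypothetical outer training loop (an assumption, not part of the original
# code): alternates which side opens, decays epsilon toward a floor, and
# periodically reports and resets the accumulated loss_for_sever_episodes.
def train_sketch(episodes=10_000):
    global loss_for_sever_episodes
    loss_for_sever_episodes = 0
    epsilon = 1.0
    for episode in range(episodes):
        starter = 1 if episode % 2 == 0 else -1  # alternate who opens
        one_episode(epsilon, starter)
        epsilon = max(0.05, epsilon * 0.999)     # simple exponential decay
        if (episode + 1) % 100 == 0:
            print(episode + 1, loss_for_sever_episodes)
            loss_for_sever_episodes = 0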
def test():
    """Let the network play both sides greedily and print each board state."""
    game.new_game()
    player = 1
    print(game.board)
    while abs(player) != 10 and not game.full_board():
        index = epsilon_greedy(0.0, player)
        player, _ = game.step(index, player)
        print(game.board)
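# Hypothetical entry point (an assumption): train with the sketch above,
# sanity-check the greedy self-play with test(), then play interactively.
if __name__ == "__main__":
    train_sketch()
    test()
    play_with()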