# Run configuration. Only input_size, output_size, learning_rate, restore and
# discount_factor are referenced by the code below; the other values are
# defined here but not used in this part of the script.
restore = False
input_size = 9
output_size = 9
max_batch_size = 1000000
learning_rate = 0.001
discount_factor = 0.9
epsilon = 0.1
train_episode = 1000
verify_episode = 10000

# Shared environment and the two agents.
env = tictactoe()
agent = AIagent_RL(learning_rate=learning_rate, restore=restore)
agent_base = AIagent_Base()


def update(agent, batch, dis=discount_factor):
    """Walk a batch of transitions and query the agent's action-value model.

    Args:
        agent: object exposing ``action_value.predict(state)`` and
            ``policy(state, turn, epsilon=...)``.
        batch: iterable of 6-tuples
            ``(state, action, reward, next_state, next_turn, done)``.
        dis: defaults to the module-level ``discount_factor`` as it was when
            this function was defined.

    NOTE(review): in the code shown here ``x_stack``, ``y_stack``,
    ``next_action``, ``next_action_value`` and ``dis`` are assigned or
    declared but never used, and nothing is returned.
    """
    # Empty 2-D arrays with zero rows and input_size / output_size columns.
    x_stack = np.empty(0).reshape(0, input_size)
    y_stack = np.empty(0).reshape(0, output_size)
    for state, action, reward, next_state, next_turn, done in batch:
        action_value = agent.action_value.predict(state)
        # Reshape the prediction to a flat array of 9 entries.
        action_value = np.reshape(action_value, 9)
        if not done:
            # epsilon=0 is passed to the policy; presumably this disables
            # exploration -- TODO confirm against the Agent module.
            next_action = agent.policy(next_state, next_turn, epsilon=0)
            next_action_value = agent.action_value.predict(next_state)
"""Tic-tac-toe match in which an AIagent_RL moves first and a Human_agent second."""
import copy
from Tictactoe_Env import tictactoe
from Agent import AIagent_RL, AIagent_Base, Human_agent

# Shared environment and players: agent1 takes the odd-numbered moves,
# agent2 the even-numbered ones.
env = tictactoe()
agent1 = AIagent_RL(restore=True)
agent2 = Human_agent()


def play():
    """Play one game on the module-level ``env`` and print the result.

    Each move, the current ``env.turn`` is copied and the agent whose move it
    is chooses an action via ``policy(state, turn, epsilon=0)``. The action is
    applied with ``env.step`` and the board is rendered. When ``env.step``
    reports the game as done, prints ``"Draw!"`` if the winner value is 0 and
    otherwise names the winning agent number.
    """
    env.reset()
    board = copy.copy(env.state)
    finished = 0
    winner = 0
    move_number = 0

    while not finished:
        move_number += 1
        turn = copy.copy(env.turn)
        # Even move numbers belong to agent2, all others to agent1.
        mover = agent2 if move_number % 2 == 0 else agent1
        action = mover.policy(board, turn, epsilon=0)
        # The reward returned by the environment is not used here.
        next_board, finished, _reward, winner = env.step(action)
        board = copy.copy(next_board)
        env.render()

    outcome = "Draw!" if winner == 0 else "Winner is agent %d!" % winner
    print(outcome)
import matplotlib.pyplot as plt
import copy
import itertools
from Tictactoe_Env import tictactoe, predict, ret_turn
from Agent import AIagent_RL, AIagent_Base
from Functions import is_finished, available_actions

# Run configuration; neither value is referenced by the code below.
verify_episode = 100
discount_factor = 0.9

# Shared environment and the two agents.
env = tictactoe()
agent = AIagent_RL(restore=False)
agent_base = AIagent_Base()

# Empty lists, presumably filled elsewhere with data for plotting
# (names suggest iteration count, value and win rate) -- TODO confirm.
iteration_plt = []
v_plt = []
wr_plt = []


def policy_evaluation(agent):
    """Sweep every 9-cell board encoding and read the agent's value for it.

    Args:
        agent: object exposing ``value(state)``.

    NOTE(review): in the code shown here ``theta``, ``delta``, ``winner`` and
    ``v`` are assigned but never used, and the ``while True`` loop has no
    visible exit.
    """
    theta = 1e-9  # presumably a convergence threshold -- TODO confirm
    while True:
        delta = 0.0  # reset at the start of each sweep
        # Every length-9 tuple over {0, 1, 2}: 3**9 candidate boards.
        state_list = itertools.product([0, 1, 2], repeat=9)
        for state in state_list:
            state = list(state)  # convert the tuple to a list
            done, winner = is_finished(state)
            if not done:  # terminal states are skipped
                v = agent.value(state)
"""Tic-tac-toe match in which a Human_agent moves first and an AIagent_RL second."""
import copy
from Tictactoe_Env import tictactoe
from Agent import AIagent_RL, AIagent_Base, Human_agent

# Shared environment and players: agent1 opens the game, agent2 replies.
env = tictactoe()
agent1 = Human_agent()
agent2 = AIagent_RL(restore=True)


def play():
    """Run a single game on the module-level ``env`` and announce the result.

    Moves alternate between ``agent1`` (odd-numbered moves) and ``agent2``
    (even-numbered moves). Each agent is asked for an action through
    ``policy(state, turn, epsilon=0)``, the action is passed to ``env.step``,
    and the board is rendered after every move. Once the environment reports
    the game as done, prints ``"Draw!"`` when the winner value is 0, or the
    winning agent number otherwise.
    """
    game_over = 0
    winner = 0
    env.reset()
    current_state = copy.copy(env.state)
    ply = 0

    while not game_over:
        ply += 1
        current_turn = copy.copy(env.turn)
        # ply % 2 is 1 on odd moves (agent1) and 0 on even moves (agent2).
        player = (agent2, agent1)[ply % 2]
        chosen_action = player.policy(current_state, current_turn, epsilon=0)
        # The reward from the environment is ignored in this script.
        following_state, game_over, _reward, winner = env.step(chosen_action)
        current_state = copy.copy(following_state)
        env.render()

    if winner != 0:
        print("Winner is agent %d!" % winner)
    else:
        print("Draw!")