def play():
    done = 0
    env.reset()
    state = copy.copy(env.state)
    i = 0
    while not done:
        i += 1
        turn = copy.copy(env.turn)
        if i % 2 == 1:
            action = agent1.policy(state, turn, available_actions(state), epsilon=0)
        else:
            action = agent2.policy(state, turn, available_actions(state), epsilon=0)
        next_state, done, winner = env.step(action)
        # update AI agent
        update(agent2, state, next_state)
        state = copy.copy(next_state)
        env.render()
    if winner == 0:
        print("Draw!")
    else:
        print("Winner is agent %d!" % winner)
    # save data
    agent2.save()
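# available_actions() is used throughout these listings but not defined here.
# A minimal sketch, assuming the board is a list of nine cells where 0 marks an
# empty square, 1 the first player, and 2 the second player:
def available_actions(state):
    # the indices of the empty cells are the legal moves
    return [i for i in range(9) if state[i] == 0]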
def policy(self, state, turn, epsilon=0.08):
    maxvalue = -999999
    minvalue = 999999
    available = available_actions(state)
    action_list = []
    if np.random.rand(1) < epsilon:
        action_list = available
    else:
        value = self.action_value.predict(state)
        value = np.reshape(value, 9)
        if turn == 1:
            for action in available:
                if value[action] > maxvalue:
                    action_list = []
                    maxvalue = value[action]
                    action_list.append(action)
                elif value[action] == maxvalue:
                    action_list.append(action)
        else:
            for action in available:
                if value[action] < minvalue:
                    action_list = []
                    minvalue = value[action]
                    action_list.append(action)
                elif value[action] == minvalue:
                    action_list.append(action)
    return random.choice(action_list)
def policy(self, state, turn, epsilon=0.08):
    maxvalue = -99999
    minvalue = 99999
    encoded_state = encode(state)
    available = available_actions(state)
    action_list = []
    if np.random.rand(1) < epsilon:
        action_list = available
    else:
        if turn == 1:
            for action in available:
                encoded = encoded_state + str(action)
                if self.action_value[encoded] > maxvalue:
                    action_list = []
                    maxvalue = self.action_value[encoded]
                    action_list.append(action)
                elif self.action_value[encoded] == maxvalue:
                    action_list.append(action)
        else:
            for action in available:
                encoded = encoded_state + str(action)
                if self.action_value[encoded] < minvalue:
                    action_list = []
                    minvalue = self.action_value[encoded]
                    action_list.append(action)
                elif self.action_value[encoded] == minvalue:
                    action_list.append(action)
    return random.choice(action_list)
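# encode() is assumed to turn the mutable board list into a hashable string key
# for the action-value dictionary (the tabular agent appends str(action) to it).
# A minimal sketch under that assumption:
def encode(state):
    # e.g. [0, 1, 2, 0, ...] -> "0120..."
    return ''.join(str(cell) for cell in state)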
def policy(self, state, turn, epsilon=0):
    available = available_actions(state)
    while True:
        ret = int(input("input [0 1 2 / 3 4 5 / 6 7 8] : "))
        if ret in available:
            break
    return ret
def init_value(self):
    state_list = itertools.product([0, 1, 2], repeat=9)
    for state in state_list:
        state = list(state)
        encoded_state = encode(state)
        available = available_actions(state)
        for action in available:
            encoded = encoded_state + str(action)
            self.action_value[encoded] = 0
def init_value(self):
    state_list = itertools.product([0, 1, 2], repeat=9)
    for state in state_list:
        state = list(state)
        encoded = encode(state)
        done, winner = is_finished(state)
        if not done:
            self._value[encoded] = random.uniform(-0.5, 0.5)
            self._policy[encoded] = random.choice(available_actions(state))
        else:  # terminal state value
            self._value[encoded] = 0
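# is_finished() is referenced in several listings but not shown. A plausible
# sketch, assuming it returns (done, winner) with winner 0 for a draw or an
# unfinished game and 1/2 for the winning player:
def is_finished(state):
    lines = [(0, 1, 2), (3, 4, 5), (6, 7, 8),   # rows
             (0, 3, 6), (1, 4, 7), (2, 5, 8),   # columns
             (0, 4, 8), (2, 4, 6)]              # diagonals
    for a, b, c in lines:
        if state[a] != 0 and state[a] == state[b] == state[c]:
            return True, state[a]
    if 0 not in state:      # full board with no three-in-a-row
        return True, 0
    return False, 0         # game still in progress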
def policy(self, state, turn, epsilon=0):
    available = available_actions(state)
    action_list = []
    for i in available:
        state[i] = turn
        done, winner = is_finished(state)
        state[i] = 0
        if done:
            action_list.append(i)
    if len(action_list) == 0:
        action_list = available
    return random.choice(action_list)
def policy_improvement(agent):
    policy_stable = True
    state_list = itertools.product([0, 1, 2], repeat=9)
    for state in state_list:
        state = list(state)
        done, winner = is_finished(state)
        if not done:  # except for terminal state
            available = available_actions(state)
            turn = ret_turn(state)
            old_action = agent.policy(state)
            max_value = -9999999
            min_value = 9999999
            if turn == 1:
                for action in available:
                    next_state, reward = predict(state, action)
                    value = reward + discount_factor * agent.value(next_state)
                    if value > max_value:
                        max_value = value
                        new_action = action
            else:
                for action in available:
                    next_state, reward = predict(state, action)
                    value = reward + discount_factor * agent.value(next_state)
                    if value < min_value:
                        min_value = value
                        new_action = action
            agent.assign_policy(state, new_action)
            if old_action != new_action:
                policy_stable = False
    return policy_stable
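# policy_improvement() relies on two helpers that are not shown in these
# listings. The sketches below are assumptions: ret_turn() infers whose move it
# is from the mark counts, and predict() returns the deterministic afterstate
# plus a reward of +1 when player 1 wins and -1 when player 2 wins, which is
# what the max/min branches above imply.
def ret_turn(state):
    # player 1 moves first, so it is player 1's turn when the counts are equal
    return 1 if state.count(1) == state.count(2) else 2

def predict(state, action):
    next_state = copy.copy(state)
    next_state[action] = ret_turn(state)
    done, winner = is_finished(next_state)
    if winner == 1:
        reward = 1
    elif winner == 2:
        reward = -1
    else:
        reward = 0
    return next_state, reward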
def train():
    win_rate_list = []
    win_rate_mean = []
    episode = 0
    while True:  # episode < total_episode:
        # epsilon = random.choice(epsilon_list)
        # training stage1 (self-training)
        for _ in range(train_episode1):
            episode += 1
            done = 0
            env.reset()
            state = copy.copy(env.state)
            while not done:
                turn = copy.copy(env.turn)
                action = agent.policy(state, turn, available_actions(state), epsilon=epsilon)
                next_state, done, winner = env.step(action)
                update(agent, state, next_state, learning_rate=learning_rate)
                state = copy.copy(next_state)
        # training stage2 (vs agent_base)
        for i in range(train_episode2):
            episode += 1
            done = 0
            env.reset()
            state = copy.copy(env.state)
            j = 0
            while not done:
                j += 1
                turn = copy.copy(env.turn)
                if (i + j) % 2 == 1:
                    action = agent.policy(state, turn, available_actions(state), epsilon=epsilon)
                else:
                    action = agent_base.policy(state, turn, available_actions(state))
                next_state, done, winner = env.step(action)
                if done:
                    update(agent, state, next_state, learning_rate=learning_rate)
                state = copy.copy(next_state)
        # verification stage
        win = lose = draw = 0
        for i in range(verify_episode):
            done = 0
            env.reset()
            state = copy.copy(env.state)
            j = 0
            while not done:
                j += 1
                turn = copy.copy(env.turn)
                if (i + j) % 2 == 1:  # epsilon 0
                    action = agent.policy(state, turn, available_actions(state), epsilon=0)
                else:
                    action = agent_base.policy(state, turn, available_actions(state))
                next_state, done, winner = env.step(action)
                state = copy.copy(next_state)
            if winner == 0:
                draw += 1
            elif (i + j) % 2 == 1:
                win += 1
            else:
                lose += 1
        win_rate = (win + draw) / verify_episode
        print("[Episode %d] Win : %d Draw : %d Lose : %d Win_rate: %.2f" % (episode, win, draw, lose, win_rate))
        agent.save()
        if win_rate > 0.97:
            break
        # print status (each train_episode * 100)
        win_rate_list.append(win_rate)
        if episode % ((train_episode1 + train_episode2) * 100) == 0:
            mean = np.mean(win_rate_list)
            win_rate_mean.append(np.round(mean, 2))
            win_rate_list.clear()
            print("[ ", end='')
            for x in win_rate_mean:
                print("%.2f" % x, end=' ')
            print("]")
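# update() is called with (agent, state, next_state) in both play() and
# train(), but its body is not part of these listings. One plausible shape is a
# TD(0) backup on state values; agent.assign_value() is a hypothetical setter
# used only for illustration, and the reward and discount choices here are
# assumptions, not the repository's actual settings.
def update(agent, state, next_state, learning_rate=0.1, discount_factor=0.9):
    done, winner = is_finished(next_state)
    # terminal reward: +1 if player 1 won, -1 if player 2 won, 0 otherwise
    if winner == 1:
        reward = 1
    elif winner == 2:
        reward = -1
    else:
        reward = 0
    target = reward + (0 if done else discount_factor * agent.value(next_state))
    old = agent.value(state)
    agent.assign_value(state, old + learning_rate * (target - old))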