    action = estimator_1.predict(sess, [state])[0]
else:
    action = estimator_2.predict(sess, [state])[0]
if random_action_probability > random_action_probability_end:
    random_action_probability *= random_action_probability_decay
next_state, reward, done, _ = env.step(action)
replay_memory.add(state, action, reward, next_state, done)
batch_s, batch_a, batch_r, batch_s1, batch_d = replay_memory.get_samples(
    batch_size)
# Only train once the memory holds a full batch; alternate which estimator
# is updated, using the other one to evaluate the target (double Q-learning).
if batch_s.shape[0] == batch_size:
    if global_step % 2 == 0:
        estimator_1.update(sess, estimator_2, batch_s, batch_a, batch_r,
                           batch_s1, batch_d)
    else:
        estimator_2.update(sess, estimator_1, batch_s, batch_a, batch_r,
                           batch_s1, batch_d)
global_step += 1
if done:
    recent_timesteps.append(t + 1)
    print("Episode {} finished after {} timesteps (average {})".format(
        i_episode, t + 1, np.mean(recent_timesteps)))
    break
state = next_state
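The loop above only assumes a uniform replay buffer exposing `add` and a `get_samples` method that returns the stored transitions as separate column arrays. The class below is a minimal sketch of such a buffer; the class name, the capacity default and the empty-buffer handling are assumptions, not part of the original code.

import random
from collections import deque
import numpy as np

class ReplayMemory:
    """Uniform experience replay: store transitions, sample them at random."""

    def __init__(self, capacity=10000):
        self.buffer = deque(maxlen=capacity)

    def add(self, state, action, reward, next_state, done):
        self.buffer.append((state, action, reward, next_state, done))

    def get_samples(self, batch_size):
        # Returns fewer rows than batch_size while the buffer is still
        # filling up; the training loop checks the batch size before updating.
        n = min(batch_size, len(self.buffer))
        if n == 0:
            return (np.empty(0),) * 5
        batch = random.sample(self.buffer, n)
        s, a, r, s1, d = map(np.array, zip(*batch))
        return s, a, r, s1, d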
                      (state, action, reward, next_state, done))
else:
    error = estimator_2.td_errors(sess, estimator_1, [state], [action],
                                  [reward], [next_state])[0]
    replay_memory.add(error, (state, action, reward, next_state, done))
samples = replay_memory.sample(batch_size)
indices_batch, samples_batch = map(np.array, zip(*samples))
states_batch, action_batch, reward_batch, next_states_batch, done_batch = map(
    np.array, zip(*samples_batch))
# Alternate which estimator is updated; after each update, recompute the
# TD errors of the sampled transitions and refresh their priorities.
if global_step % 2 == 0:
    estimator_1.update(sess, estimator_2, states_batch, action_batch,
                       reward_batch, next_states_batch, done_batch)
    errors = estimator_1.td_errors(sess, estimator_2, states_batch,
                                   action_batch, reward_batch,
                                   next_states_batch)
    for i in range(len(indices_batch)):
        replay_memory.update(indices_batch[i], errors[i])
else:
    estimator_2.update(sess, estimator_1, states_batch, action_batch,
                       reward_batch, next_states_batch, done_batch)
    errors = estimator_2.td_errors(sess, estimator_1, states_batch,
                                   action_batch, reward_batch,
                                   next_states_batch)
    for i in range(len(indices_batch)):
        replay_memory.update(indices_batch[i], errors[i])
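This prioritized variant expects a different memory interface: `add(error, transition)`, `sample(batch_size)` returning `(index, transition)` pairs, and `update(index, error)`. The sketch below implements proportional prioritization with a plain list rather than a sum tree, and its name and hyperparameters (`alpha`, `eps`, `capacity`) are assumptions for illustration only.

import numpy as np

class PrioritizedReplayMemory:
    """Proportional prioritized replay (list-based sketch, no sum tree)."""

    def __init__(self, capacity=10000, alpha=0.6, eps=0.01):
        self.capacity = capacity
        self.alpha = alpha      # how strongly priorities bias sampling
        self.eps = eps          # keeps every priority strictly positive
        self.data = []
        self.priorities = []
        self.next_idx = 0

    def add(self, error, transition):
        priority = (abs(error) + self.eps) ** self.alpha
        if len(self.data) < self.capacity:
            self.data.append(transition)
            self.priorities.append(priority)
        else:
            self.data[self.next_idx] = transition
            self.priorities[self.next_idx] = priority
        self.next_idx = (self.next_idx + 1) % self.capacity

    def sample(self, batch_size):
        # Sampling probability is proportional to each transition's priority.
        probs = np.array(self.priorities) / np.sum(self.priorities)
        n = min(batch_size, len(self.data))
        indices = np.random.choice(len(self.data), size=n, replace=False, p=probs)
        return [(i, self.data[i]) for i in indices]

    def update(self, index, error):
        self.priorities[index] = (abs(error) + self.eps) ** self.alpha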
target_estimator.copy_model_from(sess, q_estimator)
for t in range(500):
    env.render()
    # Epsilon-greedy action selection
    action = None
    if np.random.rand(1) < random_action_probability:
        action = env.action_space.sample()
    else:
        action = q_estimator.predict(sess, [state])[0]
    if random_action_probability > random_action_probability_end:
        random_action_probability *= random_action_probability_decay
    next_state, reward, done, _ = env.step(action)
    replay_memory.add(state, action, reward, next_state, done)
    batch_s, batch_a, batch_r, batch_s1, batch_d = replay_memory.get_samples(
        batch_size)
    # Only train once the memory holds a full batch
    if batch_s.shape[0] == batch_size:
        q_estimator.update(sess, target_estimator, batch_s, batch_a, batch_r,
                           batch_s1, batch_d)
    if done:
        print("Episode {} finished after {} timesteps".format(
            i_episode, t + 1))
        break
    state = next_state
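`copy_model_from` is what periodically synchronizes the target network with the online Q-network. A minimal sketch of how that copy could be done for TensorFlow 1.x estimators built under distinct variable scopes follows; the helper name, the scope arguments, and the assumption that each estimator records its own scope are illustrative, not the original implementation.

import tensorflow.compat.v1 as tf  # plain `import tensorflow as tf` on TF 1.x

def copy_variables(sess, source_scope, target_scope):
    """Copy every trainable variable from source_scope into target_scope."""
    source_vars = sorted(tf.trainable_variables(scope=source_scope),
                         key=lambda v: v.name)
    target_vars = sorted(tf.trainable_variables(scope=target_scope),
                         key=lambda v: v.name)
    # In practice these assign ops are usually built once and reused.
    sess.run([t.assign(s) for s, t in zip(source_vars, target_vars)])

A `copy_model_from(sess, q_estimator)` method would then simply call this helper with the two estimators' scopes.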
class Player:
    def __init__(self, step_size=0.1, epsilon=0.1, symbol=0):
        self.step_size = step_size
        self.epsilon = epsilon
        self.previous_state = State()
        self.state = None
        self.symbol = symbol
        self.td_errors = []
        self.estimator = Estimator()
        self.policy = make_epsilon_greedy_policy(self.estimator)
        self.action = (0, 0)
        self.actions = []
        for i in range(BOARD_ROWS):
            for j in range(BOARD_COLS):
                self.actions.append((i, j))

    # Record the new state
    def set_state(self, state):
        if self.state is not None:
            self.previous_state.data = np.copy(self.state.data)
        self.state = state

    def set_symbol(self, symbol):
        self.symbol = symbol

    def set_epsilon(self, epsilon):
        self.epsilon = epsilon

    # Update the value estimate
    def backup(self, next_state, other=False):
        is_end = next_state.is_end()
        reward = 0
        if is_end:
            if next_state.winner == self.symbol:
                reward = 1
            elif next_state.winner == -self.symbol:
                reward = -1
            else:
                reward = 0
        if other:
            next_state.data = np.copy(self.state.data)
            self.state = self.previous_state

        # TD update
        q_values_next = self.estimator.predict(next_state)

        # Q-value for the TD target
        if is_end:
            td_target = reward
        else:
            gamma = 1
            td_target = reward + gamma * np.max(q_values_next)

        # Compute the TD error
        td = self.estimator.predict(self.state, self.action)
        td_error = np.abs(td_target - td)
        self.td_errors.append(td_error)

        # Update the approximator towards the TD target
        self.estimator.update(self.state, self.action, td_target)

    # Choose an action based on the current state
    def act(self):
        action_probs = self.policy(self.state, self.epsilon)
        action_idx = np.random.choice(np.arange(len(self.actions)),
                                      p=action_probs)
        self.action = self.actions[action_idx]
        next_state = self.state.next_state(self.action[0], self.action[1],
                                           self.symbol)
        is_end = next_state.is_end()
        self.backup(next_state)
        return next_state, is_end

    def save_policy(self, epoch):
        with open(
                'app/saves/policy_%s_%d.bin' %
                (('first' if self.symbol == 1 else 'second'), epoch),
                'wb') as f:
            pickle.dump(self.estimator, f)
        path = 'app/saves/metrics_%s.csv' % ('first' if self.symbol == 1
                                             else 'second')
        metrics_file = open(path, "a")
        with metrics_file:
            writer = csv.writer(metrics_file)
            for td_error in self.td_errors:
                writer.writerow([td_error])
        self.td_errors.clear()

    def load_policy(self, epoch):
        with open(
                'app/saves/policy_%s_%d.bin' %
                (('first' if self.symbol == 1 else 'second'), epoch),
                'rb') as f:
            self.estimator = pickle.load(f)
        self.policy = make_epsilon_greedy_policy(self.estimator)
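`make_epsilon_greedy_policy` is used above only as a factory that returns a function mapping a state and an epsilon to a probability distribution over the BOARD_ROWS x BOARD_COLS actions. The sketch below reflects that assumption; in particular, it assumes `Estimator.predict(state)` returns one Q-value per board cell, which is not confirmed by the listing.

def make_epsilon_greedy_policy(estimator):
    def policy_fn(state, epsilon):
        num_actions = BOARD_ROWS * BOARD_COLS
        # Every action receives epsilon / num_actions probability mass...
        probs = np.ones(num_actions, dtype=float) * epsilon / num_actions
        # ...and the greedy action receives the remaining (1 - epsilon).
        q_values = estimator.predict(state)
        probs[np.argmax(q_values)] += 1.0 - epsilon
        return probs
    return policy_fn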
class AI:
    def __init__(self, load=None, filepath='best_estimator.h5',
                 num_episodes=400, eval_episodes=20, update_freq=80,
                 mcts_iters=100, tau_cutoff=20):
        self.num_episodes = num_episodes
        self.eval_episodes = eval_episodes
        self.update_freq = update_freq
        self.mcts_iters = mcts_iters
        self.tau_cutoff = tau_cutoff
        self.filepath = filepath
        to_load = load or filepath
        if os.path.isfile(to_load):
            self.estimator = Estimator(State.raw_shape, len(State.domain),
                                       filepath=to_load)
        else:
            self.estimator = Estimator(State.raw_shape, len(State.domain))

    def duel(self, opponent, first=1):
        '''Play a full game against an opponent AI.'''
        if first == -1:
            e0, e1 = opponent, self.estimator
        else:
            e0, e1 = self.estimator, opponent
        s0 = MCTS(e0, maxiter=self.mcts_iters)
        s1 = MCTS(e1, maxiter=self.mcts_iters)
        while not s0.state.over:
            a = State.domain[np.argmax(s0.search())]
            s0.apply(a)
            s1.apply(a)
            if s0.state.over:
                break
            a = State.domain[np.argmax(s1.search())]
            s1.apply(a)
            s0.apply(a)
        return s0.state.winner

    def simulate(self, first=1):
        '''Simulate a full game by self-playing.'''
        mcts = MCTS(estimator=self.estimator, epsilon=0.25,
                    maxiter=self.mcts_iters, first=first)
        history = []
        tau = 1.0
        while not mcts.state.over:
            # Lower the temperature after the opening moves
            if len(history) == self.tau_cutoff:
                tau = 0.1
            policy = mcts.search(tau)
            history.append((mcts.state.raw, policy))
            a = np.random.choice(State.domain, p=policy)
            mcts.apply(a)
        return history, mcts.state.winner

    def train(self):
        games = []
        for i in range(self.num_episodes):
            history, winner = self.simulate(first=np.random.choice([-1, 1]))
            print("Game --> winner:", State.player_codes[winner],
                  "moves:", len(history))
            games.append((history, winner))
            # Every update_freq self-play games, train and evaluate a new model
            if (i + 1) % self.update_freq == 0:
                print("Training new model...")
                new_estimator = self.estimator.update(games)
                score = 0
                for j in range(self.eval_episodes):
                    first = np.random.choice([-1, 1])
                    winner = self.duel(new_estimator, first=first)
                    score -= first * winner
                print("New model score:", score)
                # Keep the new model only if it outperforms the current one
                if score >= ceil(0.05 * self.eval_episodes):
                    self.estimator = new_estimator
                    self.estimator.save(self.filepath)
                    print("New model selected.")
                else:
                    print("New model rejected.")
                games = games[-5 * self.eval_episodes:]  # truncate history
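Driving the class above takes only a couple of lines; the arguments shown are simply the constructor defaults.

if __name__ == '__main__':
    ai = AI(filepath='best_estimator.h5', num_episodes=400, eval_episodes=20)
    ai.train()  # self-play, periodic evaluation against the current best, checkpointing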