action = estimator_1.predict(sess, [state])[0]
                else:
                    action = estimator_2.predict(sess, [state])[0]

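            # Decay the exploration rate until it reaches its floor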
            if random_action_probability > random_action_probability_end:
                random_action_probability *= random_action_probability_decay

            next_state, reward, done, _ = env.step(action)

            replay_memory.add(state, action, reward, next_state, done)

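            # Once a full batch is buffered, alternate which estimator is updated, using the other as its target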
            batch_s, batch_a, batch_r, batch_s1, batch_d = replay_memory.get_samples(
                batch_size)
            if batch_s.shape[0] == batch_size:
                if global_step % 2 == 0:
                    estimator_1.update(sess, estimator_2, batch_s, batch_a,
                                       batch_r, batch_s1, batch_d)
                else:
                    estimator_2.update(sess, estimator_1, batch_s, batch_a,
                                       batch_r, batch_s1, batch_d)

            global_step += 1

            if done:
                recent_timesteps.append(t + 1)
                print("Episode {} finished after {} timesteps (average {})".
                      format(i_episode, t + 1, np.mean(recent_timesteps)))
                break

            state = next_state
Example #2
            if global_step % 2 == 0:
                error = estimator_1.td_errors(sess, estimator_2, [state],
                                              [action], [reward],
                                              [next_state])[0]
                replay_memory.add(error,
                                  (state, action, reward, next_state, done))
            else:
                error = estimator_2.td_errors(sess, estimator_1, [state],
                                              [action], [reward],
                                              [next_state])[0]
                replay_memory.add(error,
                                  (state, action, reward, next_state, done))

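            # Draw a prioritized sample and unpack the (index, transition) pairs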
            samples = replay_memory.sample(batch_size)
            indices_batch, samples_batch = map(np.array, zip(*samples))
            states_batch, action_batch, reward_batch, next_states_batch, done_batch = map(
                np.array, zip(*samples_batch))

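            # Alternate which estimator is updated, then refresh the sampled priorities with the new TD errors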
            if global_step % 2 == 0:
                estimator_1.update(sess, estimator_2, states_batch,
                                   action_batch, reward_batch,
                                   next_states_batch, done_batch)
                errors = estimator_1.td_errors(sess, estimator_2, states_batch,
                                               action_batch, reward_batch,
                                               next_states_batch)
                for i in range(len(indices_batch)):
                    replay_memory.update(indices_batch[i], errors[i])
            else:
                estimator_2.update(sess, estimator_1, states_batch,
                                   action_batch, reward_batch,
                                   next_states_batch, done_batch)
                errors = estimator_2.td_errors(sess, estimator_1, states_batch,
                                               action_batch, reward_batch,
                                               next_states_batch)
                for i in range(len(indices_batch)):
                    replay_memory.update(indices_batch[i], errors[i])
Example #3
            target_estimator.copy_model_from(sess, q_estimator)

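        # Run one episode for at most 500 steps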
        for t in range(500):
            env.render()

            action = None
            if np.random.rand(1) < random_action_probability:
                action = env.action_space.sample()
            else:
                action = q_estimator.predict(sess, [state])[0]

            if random_action_probability > random_action_probability_end:
                random_action_probability *= random_action_probability_decay

            next_state, reward, done, _ = env.step(action)

            replay_memory.add(state, action, reward, next_state, done)

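            # Update the Q-network toward the target network once the buffer can fill a batch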
            batch_s, batch_a, batch_r, batch_s1, batch_d = replay_memory.get_samples(
                batch_size)
            if batch_s.shape[0] == batch_size:
                q_estimator.update(sess, target_estimator, batch_s, batch_a,
                                   batch_r, batch_s1, batch_d)

            if done:
                print("Episode {} finished after {} timesteps".format(
                    i_episode, t + 1))
                break

            state = next_state
Example #4
class Player:
    def __init__(self, step_size=0.1, epsilon=0.1, symbol=0):
        self.step_size = step_size
        self.epsilon = epsilon
        self.previous_state = State()
        self.state = None
        self.symbol = symbol
        self.td_errors = []

        self.estimator = Estimator()
        self.policy = make_epsilon_greedy_policy(self.estimator)
        self.action = (0, 0)

        self.actions = []
        for i in range(BOARD_ROWS):
            for j in range(BOARD_COLS):
                self.actions.append((i, j))

    # Record the new state, keeping a copy of the previous one
    def set_state(self, state):
        if self.state is not None:
            self.previous_state.data = np.copy(self.state.data)
        self.state = state

    def set_symbol(self, symbol):
        self.symbol = symbol

    def set_epsilon(self, epsilon):
        self.epsilon = epsilon

    # Update the value estimate (TD backup)
    def backup(self, next_state, other=False):
        is_end = next_state.is_end()
        reward = 0
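        # Terminal reward: +1 for a win, -1 for a loss, 0 otherwise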
        if is_end:
            if next_state.winner == self.symbol:
                reward = 1
            elif next_state.winner == -self.symbol:
                reward = -1
            else:
                reward = 0

        if other:
            next_state.data = np.copy(self.state.data)
            self.state = self.previous_state

        # TD update: predict Q-values for the next state
        q_values_next = self.estimator.predict(next_state)

        # Q-value for the TD target
        if is_end:
            td_target = reward
        else:
            gamma = 1
            td_target = reward + gamma * np.max(q_values_next)

        # Compute the TD error
        q_value = self.estimator.predict(self.state, self.action)
        td_error = np.abs(td_target - q_value)
        self.td_errors.append(td_error)

        # Update the function approximator using the TD target
        self.estimator.update(self.state, self.action, td_target)

    # Choose an action based on the current state
    def act(self):
        action_probs = self.policy(self.state, self.epsilon)
        action_idx = np.random.choice(np.arange(len(self.actions)),
                                      p=action_probs)
        self.action = self.actions[action_idx]

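        # Apply the chosen move to obtain the resulting board state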
        next_state = self.state.next_state(self.action[0], self.action[1],
                                           self.symbol)
        is_end = next_state.is_end()

        self.backup(next_state)

        return next_state, is_end

    def save_policy(self, epoch):
        with open(
                'app/saves/policy_%s_%d.bin' %
            (('first' if self.symbol == 1 else 'second'), epoch), 'wb') as f:
            pickle.dump(self.estimator, f)

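        # Append the recorded TD errors to a per-player metrics CSV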
        path = 'app/saves/metrics_%s.csv' % ('first'
                                             if self.symbol == 1 else 'second')
        with open(path, "a") as metrics_file:
            writer = csv.writer(metrics_file)
            for td_error in self.td_errors:
                writer.writerow([td_error])

        self.td_errors.clear()

    def load_policy(self, epoch):
        with open(
                'app/saves/policy_%s_%d.bin' %
            (('first' if self.symbol == 1 else 'second'), epoch), 'rb') as f:
            self.estimator = pickle.load(f)
            self.policy = make_epsilon_greedy_policy(self.estimator)
Example #5
class AI:
    def __init__(self,
                 load=None,
                 filepath='best_estimator.h5',
                 num_episodes=400,
                 eval_episodes=20,
                 update_freq=80,
                 mcts_iters=100,
                 tau_cutoff=20):
        self.num_episodes = num_episodes
        self.eval_episodes = eval_episodes
        self.update_freq = update_freq
        self.mcts_iters = mcts_iters
        self.tau_cutoff = tau_cutoff
        self.filepath = filepath
        to_load = load or filepath
        if os.path.isfile(to_load):
            self.estimator = Estimator(State.raw_shape,
                                       len(State.domain),
                                       filepath=to_load)
        else:
            self.estimator = Estimator(State.raw_shape, len(State.domain))

    def duel(self, opponent, first=1):
        '''Play a full game against an opponent estimator.'''

        if first == -1:
            e0, e1 = opponent, self.estimator
        else:
            e0, e1 = self.estimator, opponent

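        # One search tree per estimator; every move is applied to both trees so they stay in sync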
        s0 = MCTS(e0, maxiter=self.mcts_iters)
        s1 = MCTS(e1, maxiter=self.mcts_iters)

        while not s0.state.over:

            a = State.domain[np.argmax(s0.search())]

            s0.apply(a)
            s1.apply(a)

            if s0.state.over:
                break

            a = State.domain[np.argmax(s1.search())]

            s1.apply(a)
            s0.apply(a)

        return s0.state.winner

    def simulate(self, first=1):
        '''Simulate a full game via self-play.'''

        mcts = MCTS(estimator=self.estimator,
                    epsilon=0.25,
                    maxiter=self.mcts_iters,
                    first=first)
        history = []
        tau = 1.0

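        # Self-play loop: record (raw state, search policy) pairs and sample each move from the MCTS policy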
        while not mcts.state.over:

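            # Lower the search temperature after tau_cutoff moves to make play greedier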
            if len(history) == self.tau_cutoff:
                tau = 0.1

            policy = mcts.search(tau)
            history.append((mcts.state.raw, policy))

            a = np.random.choice(State.domain, p=policy)
            mcts.apply(a)

        return history, mcts.state.winner

    def train(self):

        games = []

        for i in range(self.num_episodes):

            history, winner = self.simulate(first=np.random.choice([-1, 1]))
            print("Game --> winner:", State.player_codes[winner], "moves:",
                  len(history))
            games.append((history, winner))

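            # Every update_freq games, train a candidate estimator and evaluate it against the current one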
            if (i + 1) % self.update_freq == 0:

                print("Training new model...")
                new_estimator = self.estimator.update(games)

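                # Pit the candidate against the current estimator; a higher score favors the candidate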
                score = 0
                for j in range(self.eval_episodes):
                    first = np.random.choice([-1, 1])
                    winner = self.duel(new_estimator, first=first)
                    score -= first * winner

                print("New model score:", score)
                if score >= ceil(0.05 * self.eval_episodes):
                    self.estimator = new_estimator
                    self.estimator.save(self.filepath)
                    print("New model selected.")
                else:
                    print("New model rejected.")

                games = games[-5 * self.eval_episodes:]  # truncate history