Example #1
    def __init__(self, agents, envs):
        """

        """
        self.root = Tk()
        self.agents = agents
        self.room = TheRoom()
        self.envs = envs
        self.state = ()
        self.stopped = True
Example #2
    def training(self, verbose=False):
        """
        Training both model and options
        :return:
        """
        self.q_table.clear()
        self.q_table_o1.clear()
        self.q_table_o2.clear()
        self.q_table_o3.clear()
        self.q_table_o4.clear()
        self.q_table_o5.clear()
        self.q_table_o6.clear()
        self.q_table_o7.clear()
        self.q_table_o8.clear()

        self.env = TheRoom((1, 1), (7, 9))

        episodes = 0
        explore = self.EPSILON

        model_conv_steps = 0
        min_steps = 9999
        steps_episodes = []

        while 1:

            steps = self.play_episode(explore, options_trained=False)
            episodes += 1

            model_conv_steps += steps
            steps_episodes.append(steps)

            # min tracer
            if steps < min_steps:
                min_steps = steps

            if episodes % self.UPDATE == 0:

                if explore < self.DECAY:
                    explore = self.MIN_EPSILON
                else:
                    explore = explore - self.DECAY

            if min_steps <= CONVERGENCE:

                if verbose:
                    print(
                        "\nModel converged on {} episodes, after executing {} steps. "
                        "Best result was {} steps\n".format(
                            episodes, model_conv_steps, min_steps))
                break

        return episodes, model_conv_steps, steps_episodes
Example #3
    def __init__(self):
        """
        Agent is the entity that interacts with the environment
        """
        self.EPSILON = 0.2
        self.ALPHA = 1
        self.GAMMA = 0.99

        self.UPDATE = 1000000000
        self.DECAY = 0.02
        self.MIN_EPSILON = 0.02

        self.env = TheRoom(initial_state=(1, 1), objective=(10, 9))

        self.state = self.env.reset()
        self.q_table = collections.defaultdict(
            float)  # general Q-table, with 0 as default value
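
The Q-table above is a plain collections.defaultdict(float) keyed by (state, action) pairs, so any pair that has never been visited reads as 0. A minimal, self-contained sketch of the tabular update rule these agents apply (the value_update formula shown in the later examples), using a made-up state and action set rather than TheRoom:

import collections

ALPHA, GAMMA = 0.5, 0.99                    # illustrative values only
ACTIONS = range(4)                          # hypothetical action ids: up, down, left, right
q_table = collections.defaultdict(float)    # Q(s, a) defaults to 0.0


def best_value(state):
    # highest q-value reachable from `state` (0.0 if the state was never visited)
    return max(q_table[(state, a)] for a in ACTIONS)


def value_update(s, a, r, next_s):
    # Q(s,a) = (1 - ALPHA) * oldQ(s,a) + ALPHA * (r + GAMMA * max_a' Q(s',a'))
    q_table[(s, a)] = (1 - ALPHA) * q_table[(s, a)] + ALPHA * (r + GAMMA * best_value(next_s))


# one hypothetical transition: moving from cell (1, 1) to cell (1, 2) with reward 0
value_update((1, 1), 3, 0.0, (1, 2))
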
Example #4
    def training_options(self, verbose=False):
        """
        Train the options separately from the main objective
        :return: total number of steps taken while training all options
        """

        total_steps = 0

        op_starting = {
            12: (1, 1),
            13: (1, 1),
            21: self.env.hall_2_4,
            24: self.env.hall_1_2,
            31: self.env.hall_3_4,
            34: self.env.hall_1_3,
            42: self.env.hall_3_4,
            43: self.env.hall_2_4
        }

        self.q_table_o1.clear()
        self.q_table_o2.clear()
        self.q_table_o3.clear()
        self.q_table_o4.clear()
        self.q_table_o5.clear()
        self.q_table_o6.clear()
        self.q_table_o7.clear()
        self.q_table_o8.clear()

        for op in self.options_space:

            self.env = TheRoom(op_starting[op], self.options_B[op])

            total_reward, op_steps, _ = self.play_option(
                op, explore=self.EPSILON_OPT)

            if verbose is True:
                print("Option {} converged on {} steps".format(op, op_steps))

            total_steps += op_steps

        return total_steps
Example #5
import collections
import csv
import random

import numpy as np

# TheRoom (the grid-world environment) and the module-level constants
# CONVERGENCE, TRAINING and OUTLIER are assumed to be defined elsewhere
# in the project.


class AgentQ:
    def __init__(self):
        """
        Agent is the entity that interacts with the environment
        """
        self.EPSILON = 0.2
        self.ALPHA = 1
        self.GAMMA = 0.99

        self.UPDATE = 1000000000
        self.DECAY = 0.02
        self.MIN_EPSILON = 0.02

        self.env = TheRoom(initial_state=(1, 1), objective=(10, 9))

        self.state = self.env.reset()
        self.q_table = collections.defaultdict(
            float)  # general Q-table, with 0 as default value

    def sample_env(self):
        """
        Takes a random action and execute it on the current state
        :return: current/old state, action taken, reward received, new state accessed by (s,a), conclusion of the
        environment
        """
        action = self.env.sample_action()
        old_state = self.state

        new_state, reward, is_done = self.env.step(action)

        # if the objective was reached, reset the environment
        self.state = self.env.reset() if is_done else new_state

        return old_state, action, reward, new_state, is_done

    def best_value_and_action(self, state):
        """
        Get from the Q-table the best action and its respective value
        :param state: state to search for best action
        :return:
        """

        best_value, best_action = None, None

        # check every action on the selected state
        for action in range(self.env.primordial_actions_space_n):

            # q-table has 0 as default
            action_value = self.q_table[(state, action)]

            if best_value is None or best_value < action_value:

                best_value = action_value
                best_action = action

            # elif best_value == action_value:
            #     best_value = random.choices([best_value, action_value])[0]

        return best_value, best_action

    def get_action(self, state, explore=0.0):
        """
        Chooses an action, respecting the relation Exploit x Explore and using the q-table
        :param explore:
        :param state:
        :return:
        """

        action_space = list.copy(self.env.primordial_actions_space)

        # If the q-values of all possible actions for this state sum to 0, the agent has never visited it.
        # In that case the next action is selected at random, regardless of the explore/exploit setting;
        # otherwise the agent would always take the first action in the list, skewing the result toward
        # that action.
        aux = sum([self.q_table[state, act] for act in action_space])
        if aux == 0:
            action = random.choices(action_space)[0]

        else:

            _, action = self.best_value_and_action(state)

            # Explore
            if random.random() < explore:

                action_space.remove(action)
                action = random.choices(action_space)[0]

        return action

    def value_update(self, s, a, r, next_s):
        """
        Update the q-table according to the formula
        Q(s,a) = (1 - ALPHA) * oldQ(s,a) + ALPHA * (r + GAMMA * max_a' Q(s',a'))

        where old denotes the value before the update

        :param s: current state
        :param a: action taken
        :param r: reward received
        :param next_s:  new state accessed
        :return:
        """
        best_v, _ = self.best_value_and_action(next_s)

        old_val = self.q_table[(s, a)]
        new_val = r + self.GAMMA * best_v

        self.q_table[(s, a)] = (1 - self.ALPHA) * old_val + self.ALPHA * new_val

    def play_episode(self, explore=0.0):
        """
        Play an entire episode on the environment, i.e., agent navigates on it until the objective is reached
        :param explore: probability of taking a random action instead of best action (explore x exploit)
        :return: total reward accumulated and total steps taken until the objective was reached
        """

        # number of steps taken until the objective
        total_steps = 0

        total_reward = 0.0

        state = self.env.reset()

        while True:

            action = self.get_action(state, explore)

            # interaction with env
            new_state, reward, is_done = self.env.step(action)

            # if a new state was reached, update the q-table
            # (the state only stays the same when the movement runs into a wall)
            if state != new_state:
                self.value_update(state, action, reward, new_state)

            total_reward += reward
            total_steps += 1

            if is_done:
                break

            state = new_state

        return total_reward, total_steps

    def play_step(self, state):
        """
        Execute an action on the chosen state. Action selection follows the q-table (there is no explore option)
        :param state: state where the action must be taken
        :return: new state reached and whether the objective was reached
        """

        _, action = self.best_value_and_action(state)
        new_state, reward, is_done = self.env.step(action)

        return new_state, is_done

    def training(self, verbose=False):
        """
        Execute episodes until the best (shortest) episode reaches the CONVERGENCE threshold
        :return: number of episodes, total steps executed, and the steps taken in each episode
        """

        explore = self.EPSILON
        episode = 0
        total_steps = 0
        min_steps = 9999999
        steps_episode = []

        self.q_table.clear()

        while 1:

            episode += 1

            _, steps = self.play_episode(explore)

            if verbose:
                print("{}".format(steps), end=" ")

            total_steps = total_steps + steps

            steps_episode.append(steps)

            # min tracer
            if steps < min_steps:
                min_steps = steps

            if episode % self.UPDATE == 0:

                if explore < self.DECAY:
                    explore = self.MIN_EPSILON
                else:
                    explore = explore - self.DECAY

            if min_steps <= CONVERGENCE:

                if verbose:
                    print(
                        "\nModel converged on {} episodes, after executing {} interactions. "
                        "Best result was {}\n".format(
                            episode, total_steps, min_steps))

                break

        return episode, total_steps, steps_episode

    def random_hyper_parameter_tuning(self):
        """
        :return:
        """

        episodes = []
        interactions = []

        epsilons = [0.2]
        alphas = [1, 0.2]
        gammas = [0.99, 0.9]

        # epsilons = [0.1, 0.2, 0.3]
        # alphas = [0.1, 0.2, 0.3, 0.4, 0.5, 1]
        # gammas = [0.9, 0.95, 0.99]

        trained = []

        best_steps = 9999999
        lower_std = 999999
        best_hyper = 0

        train_test = 0
        while 1:

            if len(trained) >= (len(epsilons) * len(alphas) * len(gammas)):
                break

            else:

                self.EPSILON = random.choices(epsilons)[0]
                self.ALPHA = random.choices(alphas)[0]
                self.GAMMA = random.choices(gammas)[0]

                train = (self.EPSILON, self.ALPHA, self.GAMMA)

                if train in trained:
                    continue
                else:
                    trained.append(train)

                episodes.clear()
                interactions.clear()

                print("Training {}, with {}".format(train_test, train))

                for _ in range(TRAINING):
                    ep, inter, _ = self.training()

                    episodes.append(ep)
                    interactions.append(inter)

                self.remove_outliers(interactions, OUTLIER)
                mean_int = np.mean(interactions)
                std_int = np.std(interactions)

                print("Avg: {0:.2f}, std {1:.2f}.".format(mean_int, std_int))

                if mean_int < best_steps and std_int < lower_std:

                    best_steps = mean_int
                    lower_std = std_int
                    best_hyper = train
                    print("new avg min: {0:.2f}, std {1:.2f}. Parameters {2}".
                          format(best_steps, lower_std, best_hyper))

                train_test += 1

        print("Best avg steps: {0:.2f}, std {1:.2f}. Best parameters {2}.".
              format(best_steps, lower_std, best_hyper))

    def remove_outliers(self, data, percent):
        """

        :param percent:
        :return:
        """
        if percent == 0:
            return

        if percent > 100:
            percent = 100

        percent = percent / 2
        percent = percent / 100

        remove_n = int(len(data) * percent)

        for _ in range(0, remove_n):
            data.remove(max(data))

        for _ in range(0, remove_n):
            data.remove(min(data))

    def count_outlier_3sigma(self, data, limit):
        """
        Count the values lying more than `limit` standard deviations away from the mean
        """

        mean = np.mean(data)
        std = np.std(data)

        up_outliers = [x for x in data if (x > (mean + limit * std))]

        down_outliers = [x for x in data if (x < (mean - limit * std))]

        return len(up_outliers) + len(down_outliers)

    def export_csv(self, steps_episode):
        """
        Export the steps-per-episode series to ./q-learning.csv
        """

        with open("./q-learning.csv", "w", newline='') as csvfile:

            writer = csv.writer(csvfile)

            columns = [("Episodes"), ("Steps")]
            writer.writerow(columns)

            episodes = 1
            for step in steps_episode:

                formatted = [(episodes), step]
                writer.writerow(formatted)

                episodes += 1
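
A minimal usage sketch for the AgentQ class above. It assumes TheRoom and the CONVERGENCE constant behave as in the rest of the project, so it is an illustration of the intended call order rather than the project's own entry point:

if __name__ == "__main__":
    agent = AgentQ()

    # train until the best episode length reaches the CONVERGENCE threshold
    episodes, total_steps, steps_episode = agent.training(verbose=True)

    # dump the per-episode step counts for plotting
    agent.export_csv(steps_episode)

    # replay one greedy episode step by step (no exploration)
    state = agent.env.reset()
    is_done = False
    while not is_done:
        state, is_done = agent.play_step(state)
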
Example #6
import threading
import time
from tkinter import Tk, Frame, Button, Canvas
from tkinter import ttk

# TheRoom (the grid-world environment) and the cell size constant `lngt`
# are assumed to be defined elsewhere in the project.


class Gui:
    def __init__(self, agents, envs):
        """
        Tkinter window that animates agents navigating their environments
        :param agents: agents to animate, one per combobox entry
        :param envs: environments matching each agent
        """
        self.root = Tk()
        self.agents = agents
        self.room = TheRoom()
        self.envs = envs
        self.state = ()
        self.stopped = True

    def build_window(self):
        """

        :return:
        """

        main_frame = Frame(self.root, bg='white')
        main_frame.pack()

        model = self.room.env_model()
        width = len(model[0])
        height = len(model)  # number of rows

        self.box_agents = ttk.Combobox(
            main_frame,
            values=("Q-Learning", "Q-learning with options",
                    "Q-learning with options and actions"),
            width=28)
        self.box_agents.grid(row=0, column=0)
        self.box_agents.current(0)

        self.restart = Button(main_frame,
                              text='Start',
                              command=self.restart_anim)
        self.restart.grid(row=0, column=1)

        self.canv = Canvas(main_frame,
                           width=width * lngt,
                           height=height * lngt,
                           bg='gray')
        self.canv.grid(row=1, column=0, columnspan=2)

        for h in range(height):
            y = h * lngt
            for w in range(width):
                x = w * lngt

                if model[h][w] == 1:
                    color = 'purple'
                elif model[h][w] == 0:
                    color = 'red'
                else:
                    color = 'grey'

                self.canv.create_rectangle(x,
                                           y,
                                           x + lngt,
                                           y + lngt,
                                           fill=color,
                                           tag="state({},{})".format(h, w))

        self.thread_agent = threading.Thread(target=self.anim_agent)
        self.thread_agent.daemon = True
        self.thread_agent.start()

    def remove_agent(self):
        """
        Repaint the cell currently occupied by the agent with its original color
        """

        room = self.envs[self.box_agents.current()]

        if self.state[0] == room.objective[0] and self.state[
                1] == room.objective[1]:
            color = 'green'
        elif room.env_model()[self.state[0]][self.state[1]] == 0:
            color = 'red'
        elif room.env_model()[self.state[0]][self.state[1]] == 1:
            color = 'purple'

        self.canv.itemconfigure("state({},{})".format(self.state[0],
                                                      self.state[1]),
                                fill=color)

    def place_agent(self):

        self.canv.itemconfigure("state({},{})".format(self.state[0],
                                                      self.state[1]),
                                fill='yellow')

    def restart_anim(self):
        """
        Reset the selected environment, paint the objective and release the animation thread
        """

        try:
            self.remove_agent()
        except IndexError:
            # nothing to remove: no agent has been placed yet
            pass

        room = self.envs[self.box_agents.current()]

        self.state = room.reset()

        self.canv.itemconfigure("state({},{})".format(room.objective[0],
                                                      room.objective[1]),
                                fill='green')

        self.place_agent()

        self.stopped = False
        self.restart.config(state='disabled')

    def anim_agent(self):
        """
        Background thread that moves the selected agent step by step until the objective is reached
        """
        total_steps = 0
        while self.stopped:
            time.sleep(0.3)

        while True:

            agent = self.agents[self.box_agents.current()]

            self.remove_agent()

            self.state, is_done = agent.play_step(self.state)

            self.place_agent()
            time.sleep(0.2)

            total_steps += 1

            if is_done:

                self.restart.config(state='active')
                self.restart.config(text='Restart')
                self.stopped = True

                print("Executed in {} actions".format(total_steps))
                while self.stopped:
                    time.sleep(0.3)

                time.sleep(0.5)

                total_steps = 0

                self.restart.config(state='disabled')

    def open_window(self):

        self.build_window()
        self.root.mainloop()
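
A sketch of how the window might be wired up. The third combobox entry refers to an agent class that is not shown in these examples, so the sketch simply reuses AgentQO as a stand-in; the agent and environment lists are otherwise assumptions about how the project assembles them:

if __name__ == "__main__":
    agent_q = AgentQ()
    agent_q.training()

    agent_qo = AgentQO()
    agent_qo.training()

    agents = [agent_q, agent_qo, agent_qo]            # one entry per combobox item
    envs = [agent_q.env, agent_qo.env, agent_qo.env]

    Gui(agents, envs).open_window()
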
Example #7
    def __init__(self):
        """
        Agent is the entity that interacts with the environment
        """
        self.env = TheRoom(initial_state=(1, 1), objective=(7, 9))

        self.EPSILON = 0.2
        self.ALPHA = 1
        self.GAMMA = 0.9

        if UNIVERSAL_HYPERPARAM is False:
            self.EPSILON_OPT = 0.2
            self.ALPHA_OPT = 0.3
            self.GAMMA_OPT = 0.9

        else:
            self.EPSILON_OPT = self.EPSILON
            self.ALPHA_OPT = self.ALPHA
            self.GAMMA_OPT = self.GAMMA

        self.UPDATE = 1000000000
        self.DECAY = 0.02
        self.MIN_EPSILON = 0.02

        # general Q-table, with 0 as default value
        self.q_table = collections.defaultdict(float)

        # Set of possible options
        self.options_space = [12, 13, 21, 24, 31, 34, 42, 43]

        # Options can have a specific starting point (instead of being selectable in any state), i.e., each state can
        # start a specific set of options. This set is represented by the greek letter IOTA
        self.options_I = {
            # option 12 can start in any state of room1 or in the hallway between rooms 1 and 3
            12: [self.env.room1, self.env.hall_1_3],
            13: [self.env.room1, self.env.hall_1_2],
            21: [self.env.room2, self.env.hall_2_4],
            24: [self.env.room2, self.env.hall_1_2],
            31: [self.env.room3, self.env.hall_3_4],
            34: [self.env.room3, self.env.hall_1_3],
            42: [self.env.room4, self.env.hall_3_4],
            43: [self.env.room4, self.env.hall_2_4],
        }

        # Each option has a (sub)-objective that tries to help the model to reach its main objective
        # This sub-objective is represented by the greek letter BETA
        self.options_B = {
            # option 12's objective is to reach the hallway between rooms 1 and 2
            12: self.env.hall_1_2,
            13: self.env.hall_1_3,
            21: self.env.hall_1_2,
            24: self.env.hall_2_4,
            31: self.env.hall_1_3,
            34: self.env.hall_3_4,
            42: self.env.hall_2_4,
            43: self.env.hall_3_4
        }

        # each option has a specific policy/q-table
        self.q_table_o1 = collections.defaultdict(float)
        self.q_table_o2 = collections.defaultdict(float)
        self.q_table_o3 = collections.defaultdict(float)
        self.q_table_o4 = collections.defaultdict(float)
        self.q_table_o5 = collections.defaultdict(float)
        self.q_table_o6 = collections.defaultdict(float)
        self.q_table_o7 = collections.defaultdict(float)
        self.q_table_o8 = collections.defaultdict(float)

        self.options_q_tables = {
            12: self.q_table_o1,
            13: self.q_table_o2,
            21: self.q_table_o3,
            24: self.q_table_o4,
            31: self.q_table_o5,
            34: self.q_table_o6,
            42: self.q_table_o7,
            43: self.q_table_o8
        }
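
The two dictionaries encode the standard options framework: options_I is the initiation set (the room and hallway from which an option may be started, the Greek letter iota) and options_B its termination sub-goal (the hallway it ends in, the Greek letter beta), while options_q_tables holds each option's internal policy. A hypothetical lookup, assuming an instance of the class this constructor belongs to (AgentQO in the next example):

agent = AgentQO()

option = 12
initiation_room, initiation_hall = agent.options_I[option]
termination_hall = agent.options_B[option]
option_policy = agent.options_q_tables[option]    # this option's own q-table

print("option {} starts in {} or {} and terminates at {}".format(
    option, initiation_room, initiation_hall, termination_hall))
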
Example #8
import collections
import csv
import random

import numpy as np

# TheRoom (the grid-world environment) and the module-level constants
# UNIVERSAL_HYPERPARAM, CONVERGENCE, TRAINING and OUTLIER are assumed to be
# defined elsewhere in the project.


class AgentQO:
    def __init__(self):
        """
        Agent is the entity that interacts with the environment
        """
        self.env = TheRoom(initial_state=(1, 1), objective=(7, 9))

        self.EPSILON = 0.2
        self.ALPHA = 1
        self.GAMMA = 0.9

        if UNIVERSAL_HYPERPARAM is False:
            self.EPSILON_OPT = 0.2
            self.ALPHA_OPT = 0.3
            self.GAMMA_OPT = 0.9

        else:
            self.EPSILON_OPT = self.EPSILON
            self.ALPHA_OPT = self.ALPHA
            self.GAMMA_OPT = self.GAMMA

        self.UPDATE = 1000000000
        self.DECAY = 0.02
        self.MIN_EPSILON = 0.02

        # general Q-table, with 0 as default value
        self.q_table = collections.defaultdict(float)

        # Set of possible options
        self.options_space = [12, 13, 21, 24, 31, 34, 42, 43]

        # Options can have a specific starting point (instead of being selectable in any state), i.e., each state can
        # start a specific set of options. This set is represented by the greek letter IOTA
        self.options_I = {
            # option 12 can start in any state of room1 or in the hallway between rooms 1 and 3
            12: [self.env.room1, self.env.hall_1_3],
            13: [self.env.room1, self.env.hall_1_2],
            21: [self.env.room2, self.env.hall_2_4],
            24: [self.env.room2, self.env.hall_1_2],
            31: [self.env.room3, self.env.hall_3_4],
            34: [self.env.room3, self.env.hall_1_3],
            42: [self.env.room4, self.env.hall_3_4],
            43: [self.env.room4, self.env.hall_2_4],
        }

        # Each option has a (sub)-objective that tries to help the model to reach its main objective
        # This sub-objective is represented by the greek letter BETA
        self.options_B = {
            # option 12's objective is to reach the hallway between rooms 1 and 2
            12: self.env.hall_1_2,
            13: self.env.hall_1_3,
            21: self.env.hall_1_2,
            24: self.env.hall_2_4,
            31: self.env.hall_1_3,
            34: self.env.hall_3_4,
            42: self.env.hall_2_4,
            43: self.env.hall_3_4
        }

        # each option has a specific policy/q-table
        self.q_table_o1 = collections.defaultdict(float)
        self.q_table_o2 = collections.defaultdict(float)
        self.q_table_o3 = collections.defaultdict(float)
        self.q_table_o4 = collections.defaultdict(float)
        self.q_table_o5 = collections.defaultdict(float)
        self.q_table_o6 = collections.defaultdict(float)
        self.q_table_o7 = collections.defaultdict(float)
        self.q_table_o8 = collections.defaultdict(float)

        self.options_q_tables = {
            12: self.q_table_o1,
            13: self.q_table_o2,
            21: self.q_table_o3,
            24: self.q_table_o4,
            31: self.q_table_o5,
            34: self.q_table_o6,
            42: self.q_table_o7,
            43: self.q_table_o8
        }

    def get_state_options_set(self, state):
        """
        get the set of options the said state can takes
        :param state: State to check the possible options it can take
        :return: the set of options for the said state
        """

        options_set = []

        for option, limits in self.options_I.items():

            room = limits[0]
            hallway = limits[1]

            # if the state is within the limits of the option, add it to the list of possible options
            if state[0] in range(room[0][0], room[1][0]) and state[1] in range(
                    room[0][1], room[1][1]):
                options_set.append(option)

            elif tuple(state) == tuple(hallway):
                options_set.append(option)

        return options_set

    def best_value_and_action(self, state, q_table, option=None):
        """
        Get from the Q-table the best action or option and its respective value
        :param q_table: q_table to search for the best value
        :param state: state to search for the best action
        :param option: if not None, search over options instead of primitive actions
        :return: best Q(s, a/o) and its a/o
        """

        # check the kind of action to search for
        if option is None:
            set_actions = self.env.primordial_actions_space
        else:
            set_actions = self.options_space

        best_value, best_action = None, None

        # check every action on the selected state
        for action in set_actions:

            # q-table has 0 as default
            action_value = q_table[(state, action)]

            if best_value is None or best_value < action_value:

                best_value = action_value
                best_action = action

            # elif best_value == action_value:
            #     best_value = random.choices([best_value, action_value])[0]

        return best_value, best_action

    def get_action(self, state, q_table, explore=0.0, option=None):
        """
        Choose an action or option, respecting the explore/exploit trade-off and using the q-table
        :param state: state on which to act
        :param q_table: q_table to consult for the best value
        :param explore: EPSILON hyperparameter
        :param option: if not None, choose among options instead of primitive actions
        :return: the chosen action or option
        """

        if option is None:
            action_space = list.copy(self.env.primordial_actions_space)
        else:
            action_space = self.get_state_options_set(state)

        # If the q-values of all possible actions for this state sum to 0, the agent has never visited it.
        # In that case the next action is selected at random, regardless of the explore/exploit setting;
        # otherwise the agent would always take the first action in the list, skewing the result toward
        # that action.
        aux = sum([q_table[state, act] for act in action_space])
        if aux == 0:
            action = random.choices(action_space)[0]

        else:

            _, action = self.best_value_and_action(state, q_table, option)

            # Explore
            if random.random() < explore:

                action_space.remove(action)
                action = random.choices(action_space)[0]

        return action

    def value_update(self, state, action, reward, next_s, q_table, steps=1):
        """
        update the q-table according to the formula
        Q(s,a) = (1 - ALPHA) * oldQ(s,a) + ALPHA * (r + GAMMA * max_a' Q(s',a'))

        where old denotes the value before the update

        :param state: current state
        :param action: action or option taken
        :param reward: reward received
        :param next_s: new state accessed
        :param q_table: q-table being updated
        :param steps: number of steps between s and next_s; required for the option value update, which uses the formula

        E{r + (GAMMA**k) * Vo(s')}, where k is the number of steps and r = r_1 + GAMMA * r_2 + ... + (GAMMA**(k-1)) * r_k

        :return:
        """

        if action in self.env.primordial_actions_space:
            best_v, _ = self.best_value_and_action(next_s, q_table)
        else:
            best_v, _ = self.best_value_and_action(next_s,
                                                   q_table,
                                                   option=True)

        if steps > 1:
            alpha = self.ALPHA_OPT
            gamma = self.GAMMA_OPT
        else:
            alpha = self.ALPHA
            gamma = self.GAMMA

        old_val = q_table[(state, action)]
        new_val = reward + (gamma**steps) * best_v

        q_table[(state, action)] = (1 - alpha) * old_val + alpha * new_val

    def play_episode(self, explore=0.0, options_trained=True):
        """
        Play an entire episode on the environment, i.e., agent navigates on it until the objective is reached
        :param explore: probability of taking a random option instead of the best option (explore x exploit)
        :param options_trained: whether the options are already trained; if not, they keep exploring too
        :return: total steps (time intervals) taken until the objective was reached
        """

        if options_trained is True:
            options_explore = 0.0
        else:
            options_explore = self.EPSILON_OPT

        # number of steps (k = 1) taken until the objective
        total_steps = 0

        # total reward accumulated
        total_reward = 0.0

        state = self.env.reset()
        is_done = False

        while True:

            option = self.get_action(state, self.q_table, explore, option=True)

            # interaction with env
            reward, steps, new_state = self.play_option(
                option, options_explore, state)

            total_steps += steps

            # if the main objective was reached, end of episode
            if new_state == tuple(self.env.objective):
                reward = reward + 1  # accessed objective = +1 reward
                is_done = True

            # keep updating q-table
            self.value_update(state, option, reward, new_state, self.q_table,
                              steps)

            total_reward += reward

            if is_done:
                break

            state = new_state

        return total_steps

    def play_step(self, state):
        """
        Execute an action on the chosen state. Action selection follows the q-table (there is no explore option)
        :param state: state where the action must be taken
        :return: new state reached and whether the main objective was reached
        """

        is_done = False

        option = self.get_action(state, self.q_table, 0, option=True)

        reward, steps, new_state = self.play_option(option, 0, state)

        if new_state == tuple(self.env.objective):
            is_done = True

        return new_state, is_done

    def play_option(self, option, explore, state=None):
        """
        Each option take N steps to accomplish its sub-objective, which end the interactions.
        These steps use regular actions, therefore, to select the best one, a q-table particular to the option is used.

        q-table -> policy

        :param option: option being executed
        :param explore: Explore x Exploit on the selecion of actions for the option
        :param state: state where the option must start
        :return: total reward accumulated from the main q-table, number of steps that generated reward (state transaction),
        total number of steps (even the atemps to move to a wall), final state reached,
        """

        # option's q-table
        q_table = self.options_q_tables[option]

        # each option has its own objective
        sub_objective = self.options_B[option]

        # take default initial state if no other is indicated
        if state is None:
            state = self.env.reset()

        total_steps = 0

        # reward from the main policy
        total_reward = 0.0

        # relative_reward is specific for the option q-table
        relative_reward = 0

        is_done = False

        while True:

            action = self.get_action(state, q_table, explore)

            # interaction with env
            new_state, reward, _ = self.env.step(action, state)

            total_steps += 1

            if new_state == tuple(sub_objective):
                # we reward the agent only when the objective is reached, thus, there is no need to accumulate the reward
                relative_reward = 1
                is_done = True

            # if the new state is outside the option's state set (its room + hallway), the option should not
            # access it, so the interaction is ignored
            if not is_done:

                options_set = self.get_state_options_set(new_state)

                if option not in options_set:
                    continue

            # if the agent tried to move into a wall, the state doesn't change; in that case there is no update to do
            if state != new_state:
                self.value_update(state, action, relative_reward, new_state,
                                  q_table)

            state = new_state

            total_reward = (self.GAMMA**total_steps) * reward

            if is_done:
                break

        return total_reward, total_steps, state

    def training(self, verbose=False):
        """
        Training both model and options
        :return:
        """
        self.q_table.clear()
        self.q_table_o1.clear()
        self.q_table_o2.clear()
        self.q_table_o3.clear()
        self.q_table_o4.clear()
        self.q_table_o5.clear()
        self.q_table_o6.clear()
        self.q_table_o7.clear()
        self.q_table_o8.clear()

        self.env = TheRoom((1, 1), (7, 9))

        episodes = 0
        explore = self.EPSILON

        model_conv_steps = 0
        min_steps = 9999
        steps_episodes = []

        while 1:

            steps = self.play_episode(explore, options_trained=False)
            episodes += 1

            model_conv_steps += steps
            steps_episodes.append(steps)

            # min tracer
            if steps < min_steps:
                min_steps = steps

            if episodes % self.UPDATE == 0:

                if explore < self.DECAY:
                    explore = self.MIN_EPSILON
                else:
                    explore = explore - self.DECAY

            if min_steps <= CONVERGENCE:

                if verbose:
                    print(
                        "\nModel converged on {} episodes, after executing {} steps. "
                        "Best result was {} steps\n".format(
                            episodes, model_conv_steps, min_steps))
                break

        return episodes, model_conv_steps, steps_episodes

    def random_hyper_parameter_tuning(self, training_opts=False):
        """
        Randomly sample hyperparameter combinations (EPSILON, ALPHA, GAMMA), train with each one several
        times and report the combination with the lowest average number of interactions
        :param training_opts: if True, tune the option hyperparameters instead of the model's
        :return:
        """

        interactions = []

        epsilons = [0.2]
        alphas = [1]
        gammas = [0.9]

        # epsilons = [0.1, 0.2, 0.3]
        # alphas = [0.1, 0.2, 0.3, 0.4, 0.5, 1]
        # gammas = [0.9, 0.95, 0.99]

        trained = []

        best_steps = 9999999
        lower_std = 999999
        best_hyper = 0

        train_test = 0
        while 1:

            if len(trained) >= (len(epsilons) * len(alphas) * len(gammas)):
                break

            else:

                if training_opts is True:
                    self.EPSILON_OPT = random.choices(epsilons)[0]
                    self.ALPHA_OPT = random.choices(alphas)[0]
                    self.GAMMA_OPT = random.choices(gammas)[0]

                    training = (self.EPSILON_OPT, self.ALPHA_OPT,
                                self.GAMMA_OPT)
                else:
                    self.EPSILON = random.choices(epsilons)[0]
                    self.ALPHA = random.choices(alphas)[0]
                    self.GAMMA = random.choices(gammas)[0]

                    # self.EPSILON_OPT = self.EPSILON
                    # self.ALPHA_OPT = self.ALPHA
                    # self.GAMMA_OPT = self.GAMMA

                    training = (self.EPSILON, self.ALPHA, self.GAMMA)

                if training in trained:
                    continue
                else:
                    trained.append(training)

                interactions.clear()

                print("\nTraining {}, with {}".format(train_test, training))

                for _ in range(TRAINING):

                    if training_opts is True:
                        inter = self.training_options()
                    else:
                        _, inter, _ = self.training()

                    interactions.append(inter)

                self.remove_outliers(interactions, percent=OUTLIER)
                mean_int = np.mean(interactions)
                std_int = np.std(interactions)

                print("Avg: {0:.2f}, std {1:.2f}.".format(mean_int, std_int))

                if mean_int < best_steps and std_int < lower_std:

                    best_steps = mean_int
                    lower_std = std_int
                    best_hyper = training
                    print("new avg min: {0:.2f}, std {1:.2f}. Parameters {2}".
                          format(best_steps, lower_std, best_hyper))

                train_test += 1

        print("Best avg steps: {0:.2f}, std {1:.2f}. Best parameters {2}.".
              format(best_steps, lower_std, best_hyper))

    def training_options(self, verbose=False):
        """
        Train the options separately from the main objective
        :return: total number of steps taken while training all options
        """

        total_steps = 0

        op_starting = {
            12: (1, 1),
            13: (1, 1),
            21: self.env.hall_2_4,
            24: self.env.hall_1_2,
            31: self.env.hall_3_4,
            34: self.env.hall_1_3,
            42: self.env.hall_3_4,
            43: self.env.hall_2_4
        }

        self.q_table_o1.clear()
        self.q_table_o2.clear()
        self.q_table_o3.clear()
        self.q_table_o4.clear()
        self.q_table_o5.clear()
        self.q_table_o6.clear()
        self.q_table_o7.clear()
        self.q_table_o8.clear()

        for op in self.options_space:

            self.env = TheRoom(op_starting[op], self.options_B[op])

            total_reward, op_steps, _ = self.play_option(
                op, explore=self.EPSILON_OPT)

            if verbose is True:
                print("Option {} converged on {} steps".format(op, op_steps))

            total_steps += op_steps

        return total_steps

    def remove_outliers(self, data, percent):
        """

        :param percent:
        :return:
        """
        if percent == 0:
            return

        if percent > 100:
            percent = 100

        percent = percent / 2
        percent = percent / 100

        remove_n = int(len(data) * percent)

        for _ in range(0, remove_n):
            data.remove(max(data))

        for _ in range(0, remove_n):
            data.remove(min(data))

    def count_outlier_3sigma(self, data, limit):
        """
        Count the values lying more than `limit` standard deviations away from the mean
        """

        mean = np.mean(data)
        std = np.std(data)

        up_outliers = [x for x in data if (x > (mean + limit * std))]

        down_outliers = [x for x in data if (x < (mean - limit * std))]

        return len(up_outliers) + len(down_outliers)

    def export_csv(self, steps_episode):
        """
        Export the steps-per-episode series to ./q-learningOptions.csv
        """

        with open("./q-learningOptions.csv", "w", newline='') as csvfile:

            writer = csv.writer(csvfile)

            columns = [("Episodes"), ("Steps")]
            writer.writerow(columns)

            episodes = 1
            for step in steps_episode:

                formatted = [(episodes), step]
                writer.writerow(formatted)

                episodes += 1
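
A usage sketch for AgentQO. Calling training() alone is enough to learn both the policy over options and the option policies (play_episode with options_trained=False keeps exploring inside the options); training_options() can additionally be used to train or evaluate each option in isolation against its own sub-goal. As above, TheRoom and the module-level constants are assumed:

if __name__ == "__main__":
    agent = AgentQO()

    # train the flat policy over options together with the option policies
    episodes, total_steps, steps_episodes = agent.training(verbose=True)
    agent.export_csv(steps_episodes)

    # optionally, (re)train each option in isolation against its own sub-objective
    option_steps = agent.training_options(verbose=True)

    print("{} episodes, {} steps; isolated option training took {} steps".format(
        episodes, total_steps, option_steps))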