Example 1
    def learn(self):
        # print('Learning Session')

        batch_s, batch_s_, batch_r, batch_terminal, batch_a_indices_one_hot, batch_a_indices = \
            self.memory.sample_batch(self.memory_batch_size)

        q_eval_s = self.policy_nn.forward(batch_s)
        # evaluate the next states with the target network when one is used, otherwise with the policy network
        q_eval_s_ = (self.policy_nn.forward(batch_s_) if self.target_nn is None
                     else self.target_nn.forward(batch_s_))

        if self.lib_type == LIBRARY_TORCH:
            self.policy_nn.learn_batch(batch_a_indices, batch_r,
                                       batch_terminal, self.GAMMA,
                                       self.memory_batch_size, q_eval_s,
                                       q_eval_s_)
        else:
            # batch_terminal is expected to be a (1 - done) mask, zeroing the
            #   bootstrap term gamma * max_a Q(s', a) for terminal transitions
            q_target_chosen_a = batch_r + self.GAMMA * np.max(
                q_eval_s_, axis=1) * batch_terminal
            self.policy_nn.learn_batch(batch_s, batch_a_indices_one_hot,
                                       q_target_chosen_a)

        self.learn_step_counter += 1

        if self.learn_step_counter > self.pure_exploration_phase:
            self.EPS = decrement_eps(self.EPS, self.eps_min, self.eps_dec,
                                     self.eps_dec_type)
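
For reference, a minimal standalone sketch (not part of the snippet above, with made-up batch values) of the TD target computed in the non-Torch branch, assuming batch_terminal is stored as a (1 - done) mask:

import numpy as np

batch_r = np.array([1.0, 0.0, 5.0])          # rewards of a toy batch of 3 transitions
batch_terminal = np.array([1.0, 1.0, 0.0])   # assumed (1 - done) mask: 0 for the terminal transition
q_eval_s_ = np.array([[0.2, 0.8],            # Q-values of the next states, one row per transition
                      [0.5, 0.1],
                      [0.9, 0.3]])
GAMMA = 0.99

# same expression as in the non-Torch branch: the bootstrap term gamma * max_a Q(s', a)
# is kept for non-terminal transitions and zeroed out for the terminal one
q_target_chosen_a = batch_r + GAMMA * np.max(q_eval_s_, axis=1) * batch_terminal
print(q_target_chosen_a)  # approximately [1.792 0.495 5.0]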
Example 2
    def perform_off_policy_mc_control(self,
                                      print_info=False,
                                      visualize=False,
                                      record=False):
        # off-policy MC control is the alternative to the on-policy non-exploring-starts method (both avoid exploring starts)

        if record:
            self.env = wrappers.Monitor(
                self.env, 'recordings/OP-MC/', force=True,
                video_callable=lambda episode_id: episode_id in (0, self.episodes - 1))

        target_policy, Q, C = {}, {}, {}
        for s in self.states:
            target_policy[s] = self.env.action_space.sample()
            for a in range(self.action_space_size):
                Q[s, a] = 0
                C[s, a] = 0

        accumulated_scores = 0

        print('\n', 'Game Started', '\n')

        for i in range(self.episodes):
            done = False
            ep_steps = 0
            ep_score = 0

            # build this episode's behavior policy:
            #   with probability (1 - EPS) a state is locked to the current target (greedy) action,
            #   otherwise any action may be chosen uniformly at random during the episode
            behavior_policy = {}
            for s in self.states:
                rand = np.random.random()
                behavior_policy[s] = [target_policy[s]] \
                    if rand > self.EPS \
                    else [a for a in range(self.action_space_size)]

            memory = []
            observation = self.env.reset()

            s = self.custom_env.get_state(observation)

            if visualize and i == self.episodes - 1:
                self.env.render()

            while not done:
                a = np.random.choice(behavior_policy[s])

                observation_, reward, done, info = self.env.step(a)
                ep_steps += 1
                ep_score += reward
                accumulated_scores += reward

                s_ = self.custom_env.get_state(observation_)

                memory.append((s, a, reward))

                observation, s = observation_, s_

                if visualize and i == self.episodes - 1:
                    self.env.render()

            if self.episodes < 10 or (i + 1) % (self.episodes // 10) == 0:
                print('episode %d - score: %d, steps: %d' %
                      (i + 1, ep_score, ep_steps))

            self.totalSteps[i] = ep_steps
            self.totalScores[i] = ep_score
            self.totalAccumulatedScores[i] = accumulated_scores

            if visualize and i == self.episodes - 1:
                self.env.close()

            ####################

            G = 0
            W = 1
            for s, a, reward in reversed(memory):  # from end to start
                G = self.GAMMA * G + reward  # calculate discounted return

                C[s, a] += W
                Q[s, a] += (W / C[s, a]) * (G - Q[s, a])

                target_policy[s] = max_action_q(Q, s, self.action_space_size)

                # taking a non-greedy action breaks the learning loop:
                #   the target policy is greedy (deterministic), so pi(a|s) = 0 for any non-greedy action,
                #   which would drive the importance-sampling weight to zero - only the greedy tail of each
                #   episode is learned from, a known shortcoming of off-policy MC control
                if a != target_policy[s]:
                    break

                if len(behavior_policy[s]) == 1:  # the behavior policy was greedy in s
                    prob = 1 - self.EPS  # probability of ending up with (and taking) the greedy action
                else:  # the behavior policy was uniform-random in s
                    prob = self.EPS / len(behavior_policy[s])  # probability of taking this particular random action
                W *= 1 / prob  # importance-sampling weight update: pi(a|s) / b(a|s), with pi(a|s) = 1 for the greedy action

            self.EPS = decrement_eps(self.EPS, self.eps_min, self.eps_dec,
                                     self.eps_dec_type)

        if print_info:
            print_q(Q)
            print_policy(Q, target_policy)

        print('\n', 'Game Ended', '\n')

        return target_policy, self.totalScores, self.totalAccumulatedScores
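
As a sanity check on the weighted importance-sampling update above, a short standalone sketch (with made-up numbers) showing that the incremental rule Q += (W / C) * (G - Q) reproduces the weighted average of the returns:

returns = [10.0, 2.0, 6.0]   # hypothetical returns G observed for a single (s, a) pair
weights = [1.0, 4.0, 2.0]    # hypothetical importance-sampling weights W

Q, C = 0.0, 0.0
for G, W in zip(returns, weights):
    C += W                   # accumulate the weights, as C[s, a] += W above
    Q += (W / C) * (G - Q)   # incremental weighted-mean update, as Q[s, a] above

weighted_mean = sum(w * g for w, g in zip(weights, returns)) / sum(weights)
print(Q, weighted_mean)      # both 4.2857... - the two computations agree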
Example 3
    def perform_mc_non_exploring_starts_control(self,
                                                print_info=False,
                                                visualize=False,
                                                record=False):
        # Monte Carlo control without exploring starts
        #   we use epsilon greedy with a decaying epsilon

        if record:
            self.env = wrappers.Monitor(
                self.env, 'recordings/MC-NES/', force=True,
                video_callable=lambda episode_id: episode_id in (0, self.episodes - 1))

        Q, states_actions_visited_counter = init_q1_q2(self.states,
                                                       self.action_space_size)
        accumulated_scores = 0

        print('\n', 'Game Started', '\n')

        for i in range(self.episodes):
            done = False
            ep_steps = 0
            ep_score = 0

            memory = []
            observation = self.env.reset()

            s = self.custom_env.get_state(observation)

            if visualize and i == self.episodes - 1:
                self.env.render()

            while not done:
                a = eps_greedy_q(Q, s, self.action_space_size, self.EPS,
                                 self.env)

                observation_, reward, done, info = self.env.step(a)
                ep_steps += 1
                ep_score += reward
                accumulated_scores += reward

                s_ = self.custom_env.get_state(observation_)

                memory.append((s, a, reward))

                observation, s = observation_, s_

                if visualize and i == self.episodes - 1:
                    self.env.render()

            if self.episodes < 10 or (i + 1) % (self.episodes // 10) == 0:
                print('episode %d - score: %d, steps: %d' %
                      (i + 1, ep_score, ep_steps))

            self.EPS = decrement_eps(self.EPS, self.eps_min, self.eps_dec,
                                     self.eps_dec_type)

            self.totalSteps[i] = ep_steps
            self.totalScores[i] = ep_score
            self.totalAccumulatedScores[i] = accumulated_scores

            if visualize and i == self.episodes - 1:
                self.env.close()

            ####################

            ep_states_actions_returns = calculate_episode_states_actions_returns(
                memory, self.GAMMA)

            ep_states_actions_visited = []
            for s, a, G in ep_states_actions_returns:
                if (s, a) not in ep_states_actions_visited:  # first visit
                    ep_states_actions_visited.append((s, a))
                    states_actions_visited_counter[s, a] += 1

                    # Incremental Implementation
                    # (of the update rule for the agent's estimate of the expected discounted return)
                    #   keeping a running mean avoids storing every return and re-averaging from scratch
                    #   on each visit (computationally expensive, with no gain in accuracy)
                    # new estimate = old estimate + [sample - old estimate] / N
                    Q[s, a] += (G - Q[s, a]) / states_actions_visited_counter[s, a]

        policy = get_policy_table_from_q_table(self.states, Q,
                                               self.action_space_size)

        if print_info:
            print_q(Q)
            print_policy(Q, policy)

        print('\n', 'Game Ended', '\n')

        return policy, self.totalScores, self.totalAccumulatedScores
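
The "Incremental Implementation" comment above relies on the running-mean identity new = old + (sample - old) / N; a minimal standalone sketch with hypothetical first-visit returns:

returns = [3.0, 7.0, 5.0, 9.0]   # hypothetical first-visit returns for one (s, a) pair

Q, n = 0.0, 0
for G in returns:
    n += 1                       # plays the role of states_actions_visited_counter[s, a]
    Q += (G - Q) / n             # incremental update, as in the snippet above

print(Q, sum(returns) / len(returns))  # both 6.0 - identical to the plain average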
    def perform_double_q_learning(self, visualize=False, record=False):
        if record:
            self.env = wrappers.Monitor(
                self.env, 'recordings/D-Q-L/', force=True,
                video_callable=lambda episode_id: episode_id in (0, self.episodes - 1))

        Q1, Q2 = init_q1_q2(self.states, self.action_space_size)

        accumulated_scores = 0

        print('\n', 'Game Started', '\n')

        for i in range(self.episodes):
            done = False
            ep_steps = 0
            ep_score = 0

            observation = self.env.reset()

            s = self.custom_env.get_state(observation)

            if visualize and i == self.episodes - 1:
                self.env.render()

            while not done:
                a = eps_greedy_q1_q2(Q1, Q2, s, self.action_space_size,
                                     self.EPS, self.env)

                observation_, reward, done, info = self.env.step(a)
                ep_steps += 1
                ep_score += reward
                accumulated_scores += reward

                s_ = self.custom_env.get_state(observation_)
                rand = np.random.random()
                if rand <= 0.5:
                    # update Q1: a_ is selected greedily w.r.t. Q1 (passing Q1 for both tables
                    #   presumably makes the argmax depend on Q1 alone), but evaluated with Q2 -
                    #   decoupling selection from evaluation is what reduces maximization bias
                    a_ = max_action_q1_q2(Q1, Q1, s_, self.action_space_size)
                    Q1[s, a] += self.ALPHA * (reward + self.GAMMA * Q2[s_, a_] -
                                              Q1[s, a])
                else:
                    # update Q2: a_ is selected greedily w.r.t. Q2, but evaluated with Q1
                    a_ = max_action_q1_q2(Q2, Q2, s_, self.action_space_size)
                    Q2[s, a] += self.ALPHA * (reward + self.GAMMA * Q1[s_, a_] -
                                              Q2[s, a])

                observation, s = observation_, s_

                if visualize and i == self.episodes - 1:
                    self.env.render()

            if self.episodes < 10 or (i + 1) % (self.episodes // 10) == 0:
                print('episode %d - eps: %.2f, score: %d, steps: %d' %
                      (i + 1, self.EPS, ep_score, ep_steps))

            self.EPS = decrement_eps(self.EPS, self.eps_min, self.eps_dec,
                                     self.eps_dec_type)

            self.totalSteps[i] = ep_steps
            self.totalScores[i] = ep_score
            self.totalAccumulatedScores[i] = accumulated_scores

            if visualize and i == self.episodes - 1:
                self.env.close()

        print('\n', 'Game Ended', '\n')

        return Q1, Q2, self.totalScores, self.totalAccumulatedScores
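
To make the decoupling in the update explicit, a minimal standalone sketch (hypothetical values) of a single double Q-learning step corresponding to the rand <= 0.5 branch above: the action is selected with Q1 but evaluated with Q2.

ALPHA, GAMMA = 0.1, 0.99
Q1 = {(0, 0): 0.0, (0, 1): 0.0, (1, 0): 2.0, (1, 1): 5.0}   # toy tabular Q1
Q2 = {(0, 0): 0.0, (0, 1): 0.0, (1, 0): 3.0, (1, 1): 1.0}   # toy tabular Q2

s, a, reward, s_ = 0, 1, 1.0, 1                    # one hypothetical transition
a_ = max(range(2), key=lambda act: Q1[s_, act])    # greedy selection w.r.t. Q1 -> action 1
Q1[s, a] += ALPHA * (reward + GAMMA * Q2[s_, a_] - Q1[s, a])   # ...but evaluation uses Q2
print(Q1[0, 1])                                    # 0.1 * (1.0 + 0.99 * 1.0) = 0.199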
    def perform_q_learning(self, visualize=False, record=False, pickle=False):
        if record:
            self.env = wrappers.Monitor(
                self.env, 'recordings/Q-L/', force=True,
                video_callable=lambda episode_id: episode_id in (0, self.episodes - 1))

        Q = init_q(self.states, self.action_space_size,
                   self.custom_env.file_name, pickle)

        accumulated_scores = 0

        print('\n', 'Game Started', '\n')

        for i in range(self.episodes):
            done = False
            ep_steps = 0
            ep_score = 0

            observation = self.env.reset()

            s = self.custom_env.get_state(observation)

            if visualize and i == self.episodes - 1:
                self.env.render()

            while not done:
                a = eps_greedy_q(Q, s, self.action_space_size, self.EPS,
                                 self.env)

                observation_, reward, done, info = self.env.step(a)
                ep_steps += 1
                ep_score += reward
                accumulated_scores += reward

                s_ = self.custom_env.get_state(observation_)
                a_ = max_action_q(Q, s_, self.action_space_size)
                Q[s, a] += self.ALPHA * (reward + self.GAMMA * Q[s_, a_] -
                                         Q[s, a])
                # Q[s, a] += self.ALPHA * (reward + self.GAMMA * np.max(Q[s_, :]) - Q[s, a])  # if Q is a numpy.ndarray

                observation, s = observation_, s_

                if visualize and i == self.episodes - 1:
                    self.env.render()

            if self.episodes < 10 or (i + 1) % (self.episodes // 10) == 0:
                print('episode %d - eps: %.2f, score: %d, steps: %d' %
                      (i + 1, self.EPS, ep_score, ep_steps))

            self.EPS = decrement_eps(self.EPS, self.eps_min, self.eps_dec,
                                     self.eps_dec_type)

            self.totalSteps[i] = ep_steps
            self.totalScores[i] = ep_score
            self.totalAccumulatedScores[i] = accumulated_scores

            if visualize and i == self.episodes - 1:
                self.env.close()

        print('\n', 'Game Ended', '\n')

        if pickle:
            pickle_save(Q, self.custom_env.file_name + '-q-table')

        return Q, self.totalScores, self.totalAccumulatedScores
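
For completeness, a minimal standalone sketch (hypothetical values) of the tabular Q-learning update applied inside the loop above:

ALPHA, GAMMA = 0.1, 0.99
Q = {(0, 0): 0.0, (0, 1): 0.5, (1, 0): 1.0, (1, 1): 2.0}   # toy tabular Q

s, a, reward, s_ = 0, 1, 1.0, 1                     # one hypothetical transition
q_next_max = max(Q[s_, a_] for a_ in range(2))      # max_a' Q(s', a') = 2.0
Q[s, a] += ALPHA * (reward + GAMMA * q_next_max - Q[s, a])
print(Q[0, 1])                                      # 0.5 + 0.1 * (1.0 + 0.99 * 2.0 - 0.5) = 0.748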