Example #1
from collections import defaultdict

# SACU, QNetwork, PolicyNetwork, QTable and BoltzmannPolicy are assumed to be
# provided by the surrounding project.
class SACQ(SACU):
    def __init__(self, env, qmodel: QNetwork, amodel: PolicyNetwork, tasks, gamma: float = None, num_learn: int = 10,
                 steps_per_episode: int = 1000, scheduler_period: int = 150, num_avg_gradient: int = 10,
                 listeners=None, temperature=1):
        if gamma is None:
            gamma = qmodel.gamma
        super().__init__(env, qmodel, amodel, tasks, gamma, num_learn, steps_per_episode, scheduler_period,
                         num_avg_gradient, listeners)
        self.Q = QTable()                            # scheduler Q-values, keyed by (task history, task)
        self.M = defaultdict(lambda: 0)              # number of times each task has been scheduled
        self.scheduler = self.Q.derive_policy(BoltzmannPolicy, lambda x: self.tasks, temperature=temperature)

    def train_scheduler(self, tau, Tau):
        main_task = self.tasks[0]
        xi = self.scheduler_period
        main_rewards = [r[main_task] for _, _, r, _ in tau]
        for h in range(len(Tau)):
            # Discounted main-task return collected from the h-th scheduling decision onward
            R = sum(r * self.gamma ** k for k, r in enumerate(main_rewards[h * xi:]))
            self.M[Tau[h]] += 1
            # Alternative: running-average update, dividing by the visit count instead of a fixed rate
            # self.Q[tuple(Tau[:h]), Tau[h]] += (R - self.Q[tuple(Tau[:h]), Tau[h]]) / self.M[Tau[h]]

            # Q-table update with a fixed learning rate of 0.1; change 0.1 to the desired learning rate
            self.Q[tuple(Tau[:h]), Tau[h]] += 0.1 * (R - self.Q[tuple(Tau[:h]), Tau[h]])

    def schedule_task(self, Tau):
        # Sample the next task from the Boltzmann scheduler, conditioned on the task history so far
        return self.scheduler.sample(tuple(Tau))
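
The scheduler above keys its Q-table on the history of tasks scheduled so far and updates it with a fixed learning rate, sampling the next task with Boltzmann (softmax) probabilities. A minimal self-contained sketch of that logic, using a plain defaultdict instead of the project's QTable and BoltzmannPolicy (the helper names and task names below are hypothetical), could look like this:

import math
import random
from collections import defaultdict

def boltzmann_sample(q_values, tasks, history, temperature=1.0):
    """Pick the next task with probability proportional to exp(Q / temperature)."""
    prefs = [math.exp(q_values[history, t] / temperature) for t in tasks]
    total = sum(prefs)
    return random.choices(tasks, weights=[p / total for p in prefs])[0]

def update_scheduler(q_values, history, task, discounted_return, lr=0.1):
    """Incremental Q-table update Q <- Q + lr * (R - Q), mirroring SACQ.train_scheduler."""
    q_values[history, task] += lr * (discounted_return - q_values[history, task])

# Hypothetical usage with made-up task names and return value
q_values = defaultdict(float)
tasks = ("main", "aux_reach", "aux_touch")
history = ()                                            # tasks scheduled so far in this episode
task = boltzmann_sample(q_values, tasks, history)
update_scheduler(q_values, history, task, discounted_return=3.7)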
Example #2
from collections import defaultdict

# Agent, FiniteActionEnvironment, QTable and EpsilonGreedyPolicy are assumed to be
# provided by the surrounding project.
class MonteCarlo(Agent):
    """
    Monte Carlo Agent implementation
    """

    def __init__(self, env: FiniteActionEnvironment, gamma: float = 1.0):
        """
        Create a new MonteCarlo Agent
        :param env: The environment the agent will learn from
        :param gamma: Reward discount factor
        """
        super().__init__(env)
        self.q_table = QTable()                      # state-action value estimates
        self.visit_count = defaultdict(int)          # visit counts for states and state-action pairs
        self.policy = self.q_table.derive_policy(EpsilonGreedyPolicy,
                                                 env.valid_actions_from,
                                                 epsilon=self.epsilon)
        self.gamma = gamma

    def learn(self, num_iter=100000) -> EpsilonGreedyPolicy:
        """
        Learn a policy from the environment
        :param num_iter: The number of iterations the algorithm should run
        :return: the derived policy
        """
        Q, N, pi = self.q_table, self.visit_count, self.policy
        for _ in range(num_iter):
            s = self.env.reset()
            e = []
            while not s.is_terminal():                          # Execute an episode under the current policy
                a = pi.sample(s)
                s_next, r = self.env.step(a)
                e.append((s, a, r))                             # store (state, action, reward) for the update below
                s = s_next
            
            for i, (s, a, r) in enumerate(reversed(e)):         # Iterate the episode backwards so the return G accumulates incrementally
                g = r if i == 0 else g * self.gamma + r
                N[s, a] += 1
                N[s] += 1
                Q[s, a] += (1 / N[s, a]) * (g - Q[s, a])
        return pi

    def epsilon(self, s):
        """
        Exploration rate for state s, decaying as the state is visited more often:
        epsilon(s) = N_0 / (N_0 + N(s))
        """
        N_0, N = 100, self.visit_count
        return N_0 / (N_0 + N[s])
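
The core of the learn loop above is an incremental average of episode returns together with a visit-count-based exploration rate. A small self-contained sketch of those two updates, independent of the project's QTable and EpsilonGreedyPolicy classes (the episode data is made up for illustration), might look like this:

from collections import defaultdict

gamma = 1.0
Q = defaultdict(float)                       # state-action value estimates
N = defaultdict(int)                         # visit counts for states and state-action pairs

# Fabricated episode of (state, action, reward) triples
episode = [("s0", "right", 0.0), ("s1", "right", 0.0), ("s2", "up", 1.0)]

g = 0.0
for s, a, r in reversed(episode):            # walk backwards so the return G accumulates incrementally
    g = r + gamma * g
    N[s, a] += 1
    N[s] += 1
    Q[s, a] += (1.0 / N[s, a]) * (g - Q[s, a])   # running mean of returns for (s, a)

def epsilon(s, n_0=100):
    """Exploration rate that decays as state s is visited more often."""
    return n_0 / (n_0 + N[s])

print(Q["s2", "up"], epsilon("s0"))          # 1.0 and 100/101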