from collections import defaultdict

# SACU, QNetwork, PolicyNetwork, QTable, BoltzmannPolicy, Agent, FiniteActionEnvironment
# and EpsilonGreedyPolicy are provided by the surrounding project modules.


class SACQ(SACU):
    """SAC-Q agent: extends SAC-U with a task scheduler that is learned from returns."""

    def __init__(self, env, qmodel: QNetwork, amodel: PolicyNetwork, tasks,
                 gamma: float = None, num_learn: int = 10,
                 steps_per_episode: int = 1000, scheduler_period: int = 150,
                 num_avg_gradient: int = 10, listeners=None, temperature=1):
        if gamma is None:
            gamma = qmodel.gamma
        super().__init__(env, qmodel, amodel, tasks, gamma, num_learn,
                         steps_per_episode, scheduler_period,
                         num_avg_gradient, listeners)
        self.Q = QTable()                # scheduler Q-values, keyed on (task history, task)
        self.M = defaultdict(lambda: 0)  # number of times each task has been scheduled
        self.scheduler = self.Q.derive_policy(BoltzmannPolicy, lambda x: self.tasks,
                                              temperature=temperature)

    def train_scheduler(self, tau, Tau):
        """Update the scheduler Q-table from a trajectory tau and the scheduled task sequence Tau."""
        main_task = self.tasks[0]
        xi = self.scheduler_period
        main_rewards = [r[main_task] for _, _, r, _ in tau]
        for h in range(len(Tau)):
            # Discounted main-task return collected from the h-th scheduling decision onward
            R = sum(r * self.gamma ** k for k, r in enumerate(main_rewards[h * xi:]))
            self.M[Tau[h]] += 1
            # Monte-Carlo average update:
            # self.Q[tuple(Tau[:h]), Tau[h]] += (R - self.Q[tuple(Tau[:h]), Tau[h]]) / self.M[Tau[h]]
            # We update the Q-table with a constant 0.1 learning rate instead;
            # change 0.1 to the desired learning rate.
            self.Q[tuple(Tau[:h]), Tau[h]] += 0.1 * (R - self.Q[tuple(Tau[:h]), Tau[h]])

    def schedule_task(self, Tau):
        """Sample the next task from the Boltzmann scheduler, conditioned on the tasks scheduled so far."""
        return self.scheduler.sample(tuple(Tau))
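# --- Illustrative sketch (not part of the original class) ---------------------
# The update performed in train_scheduler above, shown in isolation with plain
# dicts so the scheduler's Monte-Carlo return and Q-update are easy to follow.
# The task names, reward trace, discount factor and the 0.1 learning rate are
# assumed values for demonstration only.
def scheduler_update_sketch():
    gamma, xi, lr = 0.99, 150, 0.1             # assumed discount, scheduler period, learning rate
    Tau = ["reach", "grasp", "main"]           # hypothetical sequence of scheduled tasks
    main_rewards = [0.0] * 400 + [1.0] * 50    # hypothetical per-step rewards of the main task
    Q, M = {}, {}
    for h in range(len(Tau)):
        # Discounted main-task return from the h-th scheduling decision onward
        R = sum(r * gamma ** k for k, r in enumerate(main_rewards[h * xi:]))
        key = (tuple(Tau[:h]), Tau[h])
        M[Tau[h]] = M.get(Tau[h], 0) + 1
        Q[key] = Q.get(key, 0.0) + lr * (R - Q.get(key, 0.0))
    return Q
# A constant step size such as 0.1 weights recent returns more heavily than the
# 1/N running average, which can help while the underlying task policies are still changing.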
class MonteCarlo(Agent):
    """Monte Carlo Agent implementation"""

    def __init__(self, env: FiniteActionEnvironment, gamma: float = 1.0):
        """
        Create a new MonteCarlo Agent
        :param env: The environment the agent will learn from
        :param gamma: Reward discount factor
        """
        super().__init__(env)
        self.q_table = QTable()
        self.visit_count = defaultdict(int)
        self.policy = self.q_table.derive_policy(EpsilonGreedyPolicy,
                                                 env.valid_actions_from,
                                                 epsilon=self.epsilon)
        self.gamma = gamma

    def learn(self, num_iter=100000) -> EpsilonGreedyPolicy:
        """
        Learn a policy from the environment
        :param num_iter: The number of iterations the algorithm should run
        :return: the derived policy
        """
        Q, N, pi = self.q_table, self.visit_count, self.policy
        for _ in range(num_iter):
            s = self.env.reset()
            e, r = [], 0
            while not s.is_terminal():                   # Execute an episode
                a = pi.sample(s)
                e += [[s, a]]
                s, r = self.env.step(a)
                e[-1] += [r]
            # Traverse the episode backwards so the return G can be computed incrementally
            for i, (s, a, r) in enumerate(reversed(e)):
                g = r if i == 0 else g * self.gamma + r
                N[s, a] += 1
                N[s] += 1
                Q[s, a] += (1 / N[s, a]) * (g - Q[s, a])
        return pi

    def epsilon(self, s):
        """GLIE-style exploration schedule: epsilon decays with the number of visits to state s."""
        N_0, N = 100, self.visit_count
        return N_0 / (N_0 + N[s])
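# --- Illustrative sketch (not part of the original class) ---------------------
# The backward return computation and incremental Q-update used in learn(), shown
# on a hypothetical three-step episode with plain dicts. The states, actions and
# rewards are made up for demonstration only.
def mc_update_sketch():
    gamma = 1.0
    episode = [("s0", "hit", 0.0), ("s1", "hit", 0.0), ("s2", "stick", 1.0)]  # (state, action, reward)
    Q, N = {}, {}
    g = 0.0
    for i, (s, a, r) in enumerate(reversed(episode)):
        g = r if i == 0 else g * gamma + r     # G_t = r_t + gamma * G_{t+1}, built back to front
        N[(s, a)] = N.get((s, a), 0) + 1
        Q[(s, a)] = Q.get((s, a), 0.0) + (1 / N[(s, a)]) * (g - Q.get((s, a), 0.0))
    return Q
# The count-based step size 1/N(s, a) makes Q(s, a) the running average of all returns
# observed for that state-action pair, matching the update in learn() above.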