import random
import time

import numpy as np
import torch


def choose_option(self, state, epsilon=0.01):
    """Picks the optimal option in an epsilon-greedy way."""
    if random.random() > epsilon:
        option = self.options[randargmax(self.Q[state])]
    else:
        option = random.choice(self.options)
    return option
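# `randargmax` is used throughout but not defined in this excerpt. A minimal
# sketch, assuming it is an argmax that breaks ties uniformly at random
# (plain np.argmax always returns the first maximal index):
def randargmax(values: np.ndarray) -> int:
    """Argmax over a 1-D array with uniformly random tie-breaking."""
    values = np.asarray(values)
    best = np.flatnonzero(values == values.max())
    return int(np.random.choice(best))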
def pmf(self, state, action_values):
    """Epsilon-greedy probabilities over options, restricted to the options
    whose initiation set contains `state`."""
    if isinstance(action_values, torch.Tensor):
        action_values = action_values.detach().numpy().squeeze()
    # 1 where the option can be initiated in `state`, 0 otherwise
    probs = np.array([o.initiation(state) for o in self.options], dtype=float)
    # Mask out options that cannot be initiated so the argmax never selects them
    action_values[probs != 1] = -np.inf
    # Available non-greedy options share the exploration probability ε
    probs[probs == 1] = self.ε / (probs.sum() - 1)
    # The greedy available option gets the remaining probability mass
    probs[randargmax(action_values)] = 1 - self.ε
    assert np.isclose(probs.sum(), 1.0)
    return probs
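# A usage sketch for `pmf`, using a hypothetical option stub with an
# `initiation(state)` method (the stub and the numbers below are illustrative,
# not from the source). With ε = 0.1, three of four options available, and the
# second option greedy, the distribution is [0.05, 0.9, 0.05, 0.0].
class _StubOption:
    def __init__(self, available: bool):
        self._available = available

    def initiation(self, state):
        return 1.0 if self._available else 0.0

# agent.options = [_StubOption(True), _StubOption(True), _StubOption(True), _StubOption(False)]
# agent.ε = 0.1
# agent.pmf(state=0, action_values=np.array([0.2, 0.9, 0.1, 5.0]))
# -> array([0.05, 0.9 , 0.05, 0.  ])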
def _q_learning(self, q_values: np.ndarray = None, epsilon: float = 0.1,
                alpha: float = 0.1, gamma: float = 0.9):
    """Learns the optimal policy to get to the hallway from anywhere within a room."""
    # State is the cell of the agent in the grid plus its direction.
    # Actions are the primitives {left, right, forward}.
    if q_values is None:
        q_values = np.zeros((3, 4, 10, 10))
    state = (self.env.agent_dir, *self.env.agent_pos)
    done = False
    while not done:
        # self.env.render()
        # time.sleep(0.0005)
        a = randargmax(q_values[:, state[0], state[1], state[2]])
        a = self._epsilon_greedy(a, epsilon)
        obs, reward, done, info = self.env.step(a)
        # Note: we could infer the state of the agent from obs, but we read it directly instead
        state_next = (self.env.agent_dir, *self.env.agent_pos)
        a_next = randargmax(q_values[:, state_next[0], state_next[1], state_next[2]])
        q_index = (a, state[0], state[1], state[2])
        q_index_next = (a_next, state_next[0], state_next[1], state_next[2])
        q_values[q_index] += alpha * (reward + gamma * q_values[q_index_next] - q_values[q_index])
        state = state_next
    return q_values
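# `_epsilon_greedy` is referenced above but not defined in this excerpt. A
# minimal sketch, assuming it keeps the greedy action with probability
# 1 - epsilon and otherwise samples one of the three primitive actions:
def _epsilon_greedy(self, a: int, epsilon: float) -> int:
    """Return the greedy action `a`, or a random primitive action with probability epsilon."""
    if random.random() < epsilon:
        return random.randrange(3)  # {left, right, forward}
    return a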
def sample(self, phi):
    """Epsilon-greedy action selection from the action values for features `phi`."""
    if self.rng.uniform() < self.epsilon:
        # Explore: uniform random action
        return int(self.rng.randint(self.weights.shape[1]))
    # Exploit: greedy action with random tie-breaking
    return randargmax(self.value(phi))
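# `value` is referenced above but not defined in this excerpt. A minimal
# sketch, assuming linear function approximation with a weight matrix of shape
# (n_features, n_actions), which is consistent with `self.weights.shape[1]`
# being used as the number of actions:
def value(self, phi: np.ndarray) -> np.ndarray:
    """Action values for feature vector `phi` under a linear approximator."""
    return phi @ self.weights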
def q_learning(self, n_episodes: int, γ: float = 0.9, Q: np.ndarray = None,
               N: np.ndarray = None, α: float = None, render: bool = False):
    env = self.env.unwrapped
    n_options = len(self.options)
    state_space_dim = (4, env.width, env.height)
    dim = (n_options, *state_space_dim)
    if Q is None:
        N = np.zeros(dim)
        Q = np.zeros(dim)
    for episode in range(n_episodes):
        self.env.reset()
        state = (env.agent_dir, *reversed(env.agent_pos))
        executing_option = self.policy(Q, state)
        done = False
        while not done:
            # Step through the environment
            a = executing_option.policy(state)
            obs, reward, done, info = self.env.step(a)
            # TODO: infer the state of the agent from obs, i.e. make it a POMDP
            s_next = (env.agent_dir, *reversed(env.agent_pos))
            if render:
                action_name = list(env.actions)[a].name
                self.logger.debug(f"State: {state}, "
                                  f"Option: {executing_option}, "
                                  f"Action: {action_name}, "
                                  f"Next State: {s_next}")
                self.env.render()
                time.sleep(0.05)
            # Update the executing option's discounted return
            executing_option.k += 1
            executing_option.cumulant += γ ** executing_option.k * reward
            # Check for the termination condition and update action-values
            if executing_option.termination_function(s_next) == 1 or done:
                start_state = (self.option_idx_dict[executing_option.name],
                               *executing_option.starting_state)
                # Determine the step-size
                if α is None:
                    N[start_state] += 1
                    alpha = 1 / N[start_state]
                else:
                    alpha = α
                # Update Q in the direction of the optimal option
                r = executing_option.cumulant
                k = executing_option.k
                o = randargmax(Q[(slice(None), *s_next)])
                target = r + γ ** k * Q[(o, *s_next)]
                Q[start_state] += alpha * (target - Q[start_state])
                # Choose the next option
                executing_option = self.policy(Q, s_next)
            # Reset the state
            state = s_next
        yield Q, self.env.step_count
    return Q, self.env.step_count
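# A usage sketch for the generator above; `agent` is a hypothetical instance of
# the (unshown) class these methods belong to. Each yielded pair is the current
# action-value table and the step count of the episode that just finished, so
# the loop collects a simple learning curve of steps per episode.
steps_per_episode = []
for Q, step_count in agent.q_learning(n_episodes=100, γ=0.9):
    steps_per_episode.append(step_count)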