def lspi(
    memory: List[mdp.TransitionStep[S]],
    feature_map: Dict[Tuple[S, A], List[float]],
    state_action: Dict[S, List[A]],
    m: int,
    gamma: float,
    ϵ: float
) -> Iterable[Dict[Tuple[S, A], float]]:
    """
    Update A and b to get w* = inverse(A) @ b and yield the Q-values
    implied by the updated deterministic policy.
    feature_map: key: (state, action) pair, value: phi(s, a), a feature
    vector of dimension m
    """
    # initialize A (random, so it is invertible before any updates) and b
    A = np.random.rand(m, m)
    b = np.zeros(m)
    w = np.linalg.inv(A) @ b
    while True:
        # sample an experience tuple from the replay memory
        transition = random.choice(memory)
        state = transition.state
        next_state = transition.next_state
        feature_state = np.array(feature_map[(state, transition.action)])
        # next_action is derived from the ϵ-greedy policy implied by w
        explore = Bernoulli(ϵ)
        if explore.sample():
            next_action = Choose(set(state_action[next_state])).sample()
        else:
            next_action = state_action[next_state][np.argmax([
                np.array(feature_map[(next_state, action)]) @ w
                for action in state_action[next_state]
            ])]
        feature_next_state = np.array(feature_map[(next_state, next_action)])
        # rank-one update of A and b, then re-solve for the weights
        A += np.outer(feature_state, feature_state - gamma * feature_next_state)
        b += feature_state * transition.reward
        w = np.linalg.inv(A) @ b
        yield {
            s_a: np.array(feature_map[s_a]) @ w
            for s_a in feature_map.keys()
        }
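# Hypothetical usage sketch (not part of the original code): consume a few LSPI
# iterates and read a greedy policy off the final yielded Q-values. The names
# replay_memory, phi, actions_by_state and feature_dim are assumed inputs with
# the types in the signature above.
def lspi_usage_example(replay_memory, phi, actions_by_state, feature_dim):
    import itertools
    q_iterates = lspi(replay_memory, phi, actions_by_state,
                      m=feature_dim, gamma=0.9, ϵ=0.1)
    q_final = list(itertools.islice(q_iterates, 1000))[-1]
    # greedy deterministic policy implied by the final Q-values
    return {
        s: max(actions, key=lambda a: q_final[(s, a)])
        for s, actions in actions_by_state.items()
    }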
def sarsa_control(
    start_states: Distribution[S],
    transition_fcn: Callable[[S, A], Tuple[S, float]],
    state_action: Mapping[S, List[A]],
    approx_0: FunctionApprox[Tuple[S, A]],
    gamma: float,
    ϵ: float
) -> Iterable[FunctionApprox[Tuple[S, A]]]:
    """
    Update the Q-value function approximation using SARSA.
    The first state is sampled from start_states.
    """
    q = approx_0
    state = start_states.sample()
    action = Choose(set(state_action[state])).sample()
    while True:
        next_state, reward = transition_fcn(state, action)
        # use the ϵ-greedy policy to get next_action
        explore = Bernoulli(ϵ)
        if explore.sample():
            next_action = Choose(set(state_action[next_state])).sample()
        else:
            next_action = state_action[next_state][np.argmax(
                [q((next_state, a)) for a in state_action[next_state]]
            )]
        # move q towards the one-step SARSA target
        q = q.update([
            ((state, action), reward + gamma * q((next_state, next_action)))
        ])
        state, action = next_state, next_action
        yield q
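# Hypothetical usage sketch (not part of the original code): run SARSA for a
# fixed number of updates and keep the last Q-value approximation. The names
# start_dist, step_fn, actions_by_state and q0 are assumed inputs with the
# types in the signature above.
def sarsa_usage_example(start_dist, step_fn, actions_by_state, q0):
    import itertools
    q_trace = sarsa_control(start_dist, step_fn, actions_by_state,
                            approx_0=q0, gamma=0.9, ϵ=0.1)
    return list(itertools.islice(q_trace, 10000))[-1]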
def policy_from_q(
    q: FunctionApprox[Tuple[S, A]],
    mdp: MarkovDecisionProcess[S, A],
    ϵ: float = 0.0
) -> Policy[S, A]:
    """Return a policy that chooses the action that maximizes the reward
    for each state in the given Q function.

    Arguments:
      q -- approximation of the Q function for the MDP
      mdp -- the process for which we're generating a policy
      ϵ -- the fraction of the actions where we explore rather than
           following the optimal policy

    Returns a policy based on the given Q function.
    """
    explore = Bernoulli(ϵ)

    class QPolicy(Policy[S, A]):
        def act(self, s: S) -> Optional[Distribution[A]]:
            if mdp.is_terminal(s):
                return None
            if explore.sample():
                return Choose(set(mdp.actions(s)))
            _, action = q.argmax((s, a) for a in mdp.actions(s))
            return Constant(action)

    return QPolicy()
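# Hypothetical usage sketch (not part of the original code): q_approx and
# some_mdp are assumed inputs with the types in the signature above.
def act_greedily_once(q_approx, some_mdp, state):
    policy = policy_from_q(q_approx, some_mdp, ϵ=0.0)
    action_dist = policy.act(state)          # None at terminal states
    return None if action_dist is None else action_dist.sample()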
def next_state(state=state):
    switch_states = Bernoulli(self.p).sample()
    if switch_states:
        next_s = not state
        reward = 1 if state else 0.5
        return next_s, reward
    else:
        return state, 0.5
def next_state(state=state):
    switch_states = Bernoulli(self.p).sample()
    st: bool = state.state
    if switch_states:
        next_s: bool = not st
        reward = 1 if st else 0.5
        return NonTerminal(next_s), reward
    else:
        return NonTerminal(st), 0.5
class TestBernoulli(unittest.TestCase):
    def setUp(self):
        self.fair = Bernoulli(0.5)
        self.unfair = Bernoulli(0.3)

    def test_constant(self):
        assert_almost_equal(self, self.fair, Categorical({
            True: 0.5,
            False: 0.5
        }))
        self.assertAlmostEqual(self.fair.probability(True), 0.5)
        self.assertAlmostEqual(self.fair.probability(False), 0.5)

        assert_almost_equal(self, self.unfair, Categorical({
            True: 0.3,
            False: 0.7
        }))
        self.assertAlmostEqual(self.unfair.probability(True), 0.3)
        self.assertAlmostEqual(self.unfair.probability(False), 0.7)
def get_episode_rewards_actions(self) -> Tuple[ndarray, ndarray]:
    counts: List[int] = [self.count_init] * self.num_arms
    means: List[float] = [self.mean_init] * self.num_arms
    ep_rewards: ndarray = empty(self.time_steps)
    ep_actions: ndarray = empty(self.time_steps, dtype=int)
    for i in range(self.time_steps):
        max_action: int = max(enumerate(means), key=itemgetter(1))[0]
        epsl: float = self.epsilon_func(i)
        # exploit the best estimated arm with probability 1 - epsl,
        # otherwise explore an arm uniformly at random
        action: int = max_action if Bernoulli(1 - epsl).sample() else \
            Range(self.num_arms).sample()
        reward: float = self.arm_distributions[action].sample()
        counts[action] += 1
        # incremental update of the sample-mean estimate for this arm
        means[action] += (reward - means[action]) / counts[action]
        ep_rewards[i] = reward
        ep_actions[i] = action
    return ep_rewards, ep_actions
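# Illustrative check (not part of the original class): with count_init = 0 and
# mean_init = 0, the incremental update
#     means[action] += (reward - means[action]) / counts[action]
# reproduces the ordinary sample mean of the rewards seen for that arm.
def incremental_mean_check(rewards=(1.0, 0.0, 1.0, 1.0)):
    mean, count = 0.0, 0
    for r in rewards:
        count += 1
        mean += (r - mean) / count
    assert abs(mean - sum(rewards) / len(rewards)) < 1e-12
    return mean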
def setUp(self):
    self.fair = Bernoulli(0.5)
    self.unfair = Bernoulli(0.3)
def test_categorical(self):
    assert_almost_equal(self, self.normalized, Bernoulli(0.3))
    self.assertAlmostEqual(self.normalized.probability(True), 0.3)
    self.assertAlmostEqual(self.normalized.probability(False), 0.7)
    self.assertAlmostEqual(self.normalized.probability(None), 0.)
def plot_bernoulli_algorithms() -> None:
    probs_data = [0.1, 0.2, 0.4, 0.5, 0.6, 0.75, 0.8, 0.85, 0.9]
    mu_star = max(probs_data)
    steps = 500
    episodes = 500
    eps = 0.3
    eps_hl = 400
    ci = 5
    mi = mu_star * 3.
    ucb_alpha = 4.0
    lr = 0.5
    lr_decay = 20.
    arm_distrs = [Bernoulli(p) for p in probs_data]

    greedy_opt_init = EpsilonGreedy(
        arm_distributions=arm_distrs,
        time_steps=steps,
        num_episodes=episodes,
        epsilon=0.,
        epsilon_half_life=1e8,
        count_init=ci,
        mean_init=mi
    )
    eps_greedy = EpsilonGreedy(
        arm_distributions=arm_distrs,
        time_steps=steps,
        num_episodes=episodes,
        epsilon=eps,
        epsilon_half_life=1e8,
        count_init=0,
        mean_init=0.
    )
    decay_eps_greedy = EpsilonGreedy(
        arm_distributions=arm_distrs,
        time_steps=steps,
        num_episodes=episodes,
        epsilon=eps,
        epsilon_half_life=eps_hl,
        count_init=0,
        mean_init=0.
    )
    ucb1 = UCB1(
        arm_distributions=arm_distrs,
        time_steps=steps,
        num_episodes=episodes,
        bounds_range=1.0,
        alpha=ucb_alpha
    )
    ts = ThompsonSamplingBernoulli(
        arm_distributions=arm_distrs,
        time_steps=steps,
        num_episodes=episodes
    )
    grad_bandits = GradientBandits(
        arm_distributions=arm_distrs,
        time_steps=steps,
        num_episodes=episodes,
        learning_rate=lr,
        learning_rate_decay=lr_decay
    )

    plot_colors = ['r', 'b', 'g', 'y', 'k', 'c']
    labels = [
        'Greedy, Optimistic Initialization',
        r'$\epsilon$-Greedy',
        r'Decaying $\epsilon$-Greedy',
        'UCB1',
        'Thompson Sampling',
        'Gradient Bandit'
    ]

    exp_cum_regrets = [
        greedy_opt_init.get_expected_cum_regret(mu_star),
        eps_greedy.get_expected_cum_regret(mu_star),
        decay_eps_greedy.get_expected_cum_regret(mu_star),
        ucb1.get_expected_cum_regret(mu_star),
        ts.get_expected_cum_regret(mu_star),
        grad_bandits.get_expected_cum_regret(mu_star)
    ]
    x_vals = range(1, steps + 1)
    for i in range(len(exp_cum_regrets)):
        plt.plot(exp_cum_regrets[i], color=plot_colors[i], label=labels[i])
    plt.xlabel("Time Steps", fontsize=20)
    plt.ylabel("Expected Total Regret", fontsize=20)
    plt.title("Total Regret Curves", fontsize=25)
    plt.xlim(xmin=x_vals[0], xmax=x_vals[-1])
    plt.ylim(ymin=0.0)
    plt.grid(True)
    plt.legend(loc='upper left', fontsize=15)
    plt.show()

    exp_act_counts = [
        greedy_opt_init.get_expected_action_counts(),
        eps_greedy.get_expected_action_counts(),
        decay_eps_greedy.get_expected_action_counts(),
        ucb1.get_expected_action_counts(),
        ts.get_expected_action_counts(),
        grad_bandits.get_expected_action_counts()
    ]
    index = arange(len(probs_data))
    spacing = 0.4
    width = (1 - spacing) / len(exp_act_counts)
    for i in range(len(exp_act_counts)):
        plt.bar(
            index - (1 - spacing) / 2 + (i - 1.5) * width,
            exp_act_counts[i],
            width,
            color=plot_colors[i],
            label=labels[i]
        )
    plt.xlabel("Arms", fontsize=20)
    plt.ylabel("Expected Counts of Arms", fontsize=20)
    plt.title("Arms Counts Plot", fontsize=25)
    plt.xticks(
        index - 0.2,
        ["$p$=%.2f" % p for p in probs_data]
    )
    plt.legend(loc='upper left', fontsize=15)
    plt.tight_layout()
    plt.show()
def sample_states(self):
    return Bernoulli(self.p)
def sample_states(self) -> Distribution[bool]:
    return Bernoulli(self.p)
def next_state(state=state):
    switch_states = Bernoulli(self.p).sample()
    return not state if switch_states else state
def get_episode_rewards_actions(self) -> Tuple[ndarray, ndarray]:
    ep_rewards: ndarray = empty(self.time_steps)
    ep_actions: ndarray = empty(self.time_steps, dtype=int)
    # Beta(α, β) posterior parameters for each arm, starting from a
    # uniform Beta(1, 1) prior
    bayes: List[Tuple[int, int]] = [(1, 1)] * self.num_arms
    for i in range(self.time_steps):
        # Thompson sampling: draw a mean from each arm's posterior and
        # play the arm with the largest draw
        mean_draws: Sequence[float] = \
            [Beta(α=alpha, β=beta).sample() for alpha, beta in bayes]
        action: int = max(enumerate(mean_draws), key=itemgetter(1))[0]
        reward: float = float(self.arm_distributions[action].sample())
        alpha, beta = bayes[action]
        bayes[action] = (alpha + int(reward), beta + int(1 - reward))
        ep_rewards[i] = reward
        ep_actions[i] = action
    return ep_rewards, ep_actions


if __name__ == '__main__':
    probs_data = [0.2, 0.4, 0.8, 0.5, 0.1, 0.9]
    mu_star = max(probs_data)
    steps = 1000
    episodes = 500
    arm_distrs = [Bernoulli(p) for p in probs_data]
    ts_bernoulli = ThompsonSamplingBernoulli(
        arm_distributions=arm_distrs,
        time_steps=steps,
        num_episodes=episodes
    )
    # exp_cum_regret = ts_bernoulli.get_expected_cum_regret(mu_star)
    # print(exp_cum_regret)
    # exp_act_count = ts_bernoulli.get_expected_action_counts()
    # print(exp_act_count)
    ts_bernoulli.plot_exp_cum_regret_curve(mu_star)
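# Illustrative sketch (not part of the original file): the conjugate
# Beta-Bernoulli update used above, written with numpy only. A success
# increments α, a failure increments β, and the posterior mean α / (α + β)
# concentrates around the arm's true success probability (0.7 is a
# hypothetical value used only for this sketch).
def beta_bernoulli_posterior_demo(true_p: float = 0.7, pulls: int = 1000) -> float:
    import numpy as np
    rng = np.random.default_rng(0)
    alpha, beta_param = 1, 1          # uniform Beta(1, 1) prior
    for _ in range(pulls):
        reward = rng.random() < true_p
        alpha += int(reward)
        beta_param += int(not reward)
    return alpha / (alpha + beta_param)   # ≈ true_p for large pulls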
def next_state(state=state):
    switch_states = Bernoulli(self.p).sample()
    next_st: bool = not state.state if switch_states else state.state
    return NonTerminal(next_st)