Code example #1
File: lspi.py  Project: GYY7/RL-book
def lspi(memory: List[mdp.TransitionStep[S]], feature_map: Dict[Tuple[S, A],
                                                                List[float]],
         state_action: Dict[S, List[A]], m: int, gamma: float,
         ϵ: float) -> Iterable[Dict[Tuple[S, A], float]]:
    """
    update A and b to get w* = inverse(A) @ b and derive an ϵ-greedy policy
    feature_map: key: (state, action) pair, value: phi(s, a), a feature
    vector of dimension m
    """
    # initialize A, b
    A = np.random.rand(m, m)
    b = np.zeros(m)  # length-m vector, so b += phi * reward broadcasts correctly
    w = np.linalg.inv(A) @ b
    while True:
        transition = random.choice(memory)
        state = transition.state
        next_state = transition.next_state
        feature_state = np.array(feature_map[(state, transition.action)])
        # next_action is derived from ϵ-policy
        explore = Bernoulli(ϵ)
        if explore.sample():
            next_action = Choose(set(state_action[next_state])).sample()
        else:
            next_action = state_action[next_state][np.argmax([
                np.array(feature_map[(next_state, action)]) @ w
                for action in state_action[next_state]
            ])]
        feature_next_state = np.array(feature_map[(next_state, next_action)])
        # rank-one updates of A and b (outer product, not an inner product)
        A += np.outer(feature_state,
                      feature_state - gamma * feature_next_state)
        b += feature_state * transition.reward
        w = np.linalg.inv(A) @ b
        yield {
            s_a: np.array(feature_map[s_a]) @ w
            for s_a in feature_map.keys()
        }
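A minimal usage sketch for the generator above, assuming memory, feature_map, state_action and m have already been built by the caller (the gamma and ϵ values are arbitrary):

import itertools

q_iterates = lspi(memory, feature_map, state_action, m=m, gamma=0.9, ϵ=0.1)
# skip the first 10,000 updates and take the next yielded Q-value dictionary
q_values = next(itertools.islice(q_iterates, 10000, None))
# read off the greedy policy implied by that Q-value dictionary
greedy_policy = {
    s: max(state_action[s], key=lambda a: q_values[(s, a)])
    for s in state_action
}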
Code example #2
File: sarsa_control_GLIE.py  Project: GYY7/RL-book
def sarsa_control(start_states: Distribution[S],
                  transition_fcn: Callable[[S, A], Tuple[S, float]],
                  state_action: Mapping[S, List[A]],
                  approx_0: FunctionApprox[Tuple[S, A]], gamma: float,
                  ϵ: float) -> Iterable[FunctionApprox[Tuple[S, A]]]:
    """
    Update the Q-value function approximation using SARSA.
    The initial state is sampled from start_states.
    """
    q = approx_0
    state = start_states.sample()
    action = Choose(set(state_action[state])).sample()
    while True:
        # sample the next state and reward from the environment
        next_state, reward = transition_fcn(state, action)
        # use ϵ-greedy policy to get next_action
        explore = Bernoulli(ϵ)
        if explore.sample():
            next_action = Choose(set(state_action[next_state])).sample()
        else:
            next_action = state_action[next_state][np.argmax(
                [q((next_state, a)) for a in state_action[next_state]])]
        # SARSA update: target is r + gamma * Q(s', a')
        q = q.update([((state, action),
                       reward + gamma * q((next_state, next_action)))])
        state, action = next_state, next_action
        yield q
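For reference, a minimal sketch of a transition_fcn compatible with the Callable[[S, A], Tuple[S, float]] signature above, using the same Bernoulli state-switching dynamics as the test examples further below (the probability and rewards are arbitrary, the import path is assumed, and the action is ignored in this toy):

from typing import Tuple

from rl.distribution import Bernoulli  # assumed import path for the project's Bernoulli

def toy_transition_fcn(state: bool, action: int) -> Tuple[bool, float]:
    # flip the boolean state with probability 0.3; switching out of True pays 1.0
    switch = Bernoulli(0.3).sample()
    next_state = (not state) if switch else state
    reward = 1.0 if (switch and state) else 0.5
    return next_state, reward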
Code example #3
def policy_from_q(q: FunctionApprox[Tuple[S, A]],
                  mdp: MarkovDecisionProcess[S, A],
                  ϵ: float = 0.0) -> Policy[S, A]:
    """Return a policy that chooses the action that maximizes the reward
    for each state in the given Q function.

    Arguments:
      q -- approximation of the Q function for the MDP
      mdp -- the process for which we're generating a policy
      ϵ -- the fraction of the actions where we explore rather
      than following the optimal policy

    Returns a policy based on the given Q function.

    """
    explore = Bernoulli(ϵ)

    class QPolicy(Policy[S, A]):
        def act(self, s: S) -> Optional[Distribution[A]]:
            if mdp.is_terminal(s):
                return None

            if explore.sample():
                return Choose(set(mdp.actions(s)))

            _, action = q.argmax((s, a) for a in mdp.actions(s))
            return Constant(action)

    return QPolicy()
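A minimal sketch of using the returned policy, assuming q and mdp are already defined (some_state is a hypothetical state of the MDP and ϵ = 0.1 is an arbitrary exploration rate):

policy = policy_from_q(q, mdp, ϵ=0.1)
action_distribution = policy.act(some_state)
if action_distribution is not None:  # act returns None only for terminal states
    action = action_distribution.sample()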
Code example #4
File: test_markov_process.py  Project: mindis/RL-book
        def next_state(state=state):
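            # with probability self.p the boolean state flips; the reward is
            # 1 when switching out of True and 0.5 otherwise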
            switch_states = Bernoulli(self.p).sample()

            if switch_states:
                next_s = not state
                reward = 1 if state else 0.5
                return next_s, reward
            else:
                return state, 0.5
Code example #5
        def next_state(state=state):
            switch_states = Bernoulli(self.p).sample()

            st: bool = state.state
            if switch_states:
                next_s: bool = not st
                reward = 1 if st else 0.5
                return NonTerminal(next_s), reward
            else:
                return NonTerminal(st), 0.5
Code example #6
File: test_distribution.py  Project: shenoy1/RL-book
class TestBernoulli(unittest.TestCase):
    def setUp(self):
        self.fair = Bernoulli(0.5)
        self.unfair = Bernoulli(0.3)

    def test_constant(self):
        assert_almost_equal(self, self.fair,
                            Categorical({
                                True: 0.5,
                                False: 0.5
                            }))
        self.assertAlmostEqual(self.fair.probability(True), 0.5)
        self.assertAlmostEqual(self.fair.probability(False), 0.5)

        assert_almost_equal(self, self.unfair,
                            Categorical({
                                True: 0.3,
                                False: 0.7
                            }))
        self.assertAlmostEqual(self.unfair.probability(True), 0.3)
        self.assertAlmostEqual(self.unfair.probability(False), 0.7)
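The tests above only rely on Bernoulli(p) sampling True with probability p and exposing probability(outcome); a minimal stand-in with that behaviour (not the RL-book implementation) might look like:

import random

class SimpleBernoulli:
    """Minimal stand-in: sample() is True with probability p."""
    def __init__(self, p: float):
        self.p = p

    def sample(self) -> bool:
        return random.random() < self.p

    def probability(self, outcome: bool) -> float:
        return self.p if outcome else 1.0 - self.p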
Code example #7
 def get_episode_rewards_actions(self) -> Tuple[ndarray, ndarray]:
     counts: List[int] = [self.count_init] * self.num_arms
     means: List[float] = [self.mean_init] * self.num_arms
     ep_rewards: ndarray = empty(self.time_steps)
     ep_actions: ndarray = empty(self.time_steps, dtype=int)
     for i in range(self.time_steps):
         max_action: int = max(enumerate(means), key=itemgetter(1))[0]
         epsl: float = self.epsilon_func(i)
         action: int = max_action if Bernoulli(1 - epsl).sample() else \
             Range(self.num_arms).sample()
         reward: float = self.arm_distributions[action].sample()
         counts[action] += 1
         means[action] += (reward - means[action]) / counts[action]
         ep_rewards[i] = reward
         ep_actions[i] = action
     return ep_rewards, ep_actions
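The snippet above leaves epsilon_func unspecified; one plausible schedule, assuming the exponential half-life parameterization suggested by the epsilon_half_life argument in the plotting example below, is:

# hypothetical helper: epsilon halves every epsilon_half_life steps,
# so a very large half-life gives an (almost) constant epsilon
def make_epsilon_func(epsilon: float, epsilon_half_life: float):
    def epsilon_func(t: int) -> float:
        return epsilon * 0.5 ** (t / epsilon_half_life)
    return epsilon_func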
Code example #8
File: test_distribution.py  Project: shenoy1/RL-book
 def setUp(self):
     self.fair = Bernoulli(0.5)
     self.unfair = Bernoulli(0.3)
Code example #9
File: test_distribution.py  Project: shenoy1/RL-book
 def test_categorical(self):
     assert_almost_equal(self, self.normalized, Bernoulli(0.3))
     self.assertAlmostEqual(self.normalized.probability(True), 0.3)
     self.assertAlmostEqual(self.normalized.probability(False), 0.7)
     self.assertAlmostEqual(self.normalized.probability(None), 0.)
Code example #10
File: plot_mab_graphs.py  Project: shenoy1/RL-book
def plot_bernoulli_algorithms() -> None:
    probs_data = [0.1, 0.2, 0.4, 0.5, 0.6, 0.75, 0.8, 0.85, 0.9]
    mu_star = max(probs_data)

    steps = 500
    episodes = 500

    eps = 0.3
    eps_hl = 400

    ci = 5
    mi = mu_star * 3.

    ucb_alpha = 4.0

    lr = 0.5
    lr_decay = 20.

    arm_distrs = [Bernoulli(p) for p in probs_data]

    greedy_opt_init = EpsilonGreedy(
        arm_distributions=arm_distrs,
        time_steps=steps,
        num_episodes=episodes,
        epsilon=0.,
        epsilon_half_life=1e8,
        count_init=ci,
        mean_init=mi
    )
    eps_greedy = EpsilonGreedy(
        arm_distributions=arm_distrs,
        time_steps=steps,
        num_episodes=episodes,
        epsilon=eps,
        epsilon_half_life=1e8,
        count_init=0,
        mean_init=0.
    )
    decay_eps_greedy = EpsilonGreedy(
        arm_distributions=arm_distrs,
        time_steps=steps,
        num_episodes=episodes,
        epsilon=eps,
        epsilon_half_life=eps_hl,
        count_init=0,
        mean_init=0.
    )
    ucb1 = UCB1(
        arm_distributions=arm_distrs,
        time_steps=steps,
        num_episodes=episodes,
        bounds_range=1.0,
        alpha=ucb_alpha
    )
    ts = ThompsonSamplingBernoulli(
        arm_distributions=arm_distrs,
        time_steps=steps,
        num_episodes=episodes
    )
    grad_bandits = GradientBandits(
        arm_distributions=arm_distrs,
        time_steps=steps,
        num_episodes=episodes,
        learning_rate=lr,
        learning_rate_decay=lr_decay
    )

    plot_colors = ['r', 'b', 'g', 'y', 'k', 'c']
    labels = [
        'Greedy, Optimistic Initialization',
        r'$\epsilon$-Greedy',
        r'Decaying $\epsilon$-Greedy',
        'UCB1',
        'Thompson Sampling',
        'Gradient Bandit'
    ]

    exp_cum_regrets = [
        greedy_opt_init.get_expected_cum_regret(mu_star),
        eps_greedy.get_expected_cum_regret(mu_star),
        decay_eps_greedy.get_expected_cum_regret(mu_star),
        ucb1.get_expected_cum_regret(mu_star),
        ts.get_expected_cum_regret(mu_star),
        grad_bandits.get_expected_cum_regret(mu_star)
    ]

    x_vals = range(1, steps + 1)
    for i in range(len(exp_cum_regrets)):
        plt.plot(exp_cum_regrets[i], color=plot_colors[i], label=labels[i])
    plt.xlabel("Time Steps", fontsize=20)
    plt.ylabel("Expected Total Regret", fontsize=20)
    plt.title("Total Regret Curves", fontsize=25)
    plt.xlim(xmin=x_vals[0], xmax=x_vals[-1])
    plt.ylim(ymin=0.0)
    plt.grid(True)
    plt.legend(loc='upper left', fontsize=15)
    plt.show()

    exp_act_counts = [
        greedy_opt_init.get_expected_action_counts(),
        eps_greedy.get_expected_action_counts(),
        decay_eps_greedy.get_expected_action_counts(),
        ucb1.get_expected_action_counts(),
        ts.get_expected_action_counts(),
        grad_bandits.get_expected_action_counts()
    ]
    index = arange(len(probs_data))
    spacing = 0.4
    width = (1 - spacing) / len(exp_act_counts)

    for i in range(len(exp_act_counts)):
        plt.bar(
            index - (1 - spacing) / 2 + (i - 1.5) * width,
            exp_act_counts[i],
            width,
            color=plot_colors[i],
            label=labels[i]
        )
    plt.xlabel("Arms", fontsize=20)
    plt.ylabel("Expected Counts of Arms", fontsize=20)
    plt.title("Arms Counts Plot", fontsize=25)
    plt.xticks(
        index - 0.2,
        ["$p$=%.2f" % p for p in probs_data]
    )
    plt.legend(loc='upper left', fontsize=15)
    plt.tight_layout()
    plt.show()
Code example #11
File: test_markov_process.py  Project: mindis/RL-book
 def sample_states(self):
     return Bernoulli(self.p)
Code example #12
File: test_markov_process.py  Project: mindis/RL-book
 def sample_states(self) -> Distribution[bool]:
     return Bernoulli(self.p)
Code example #13
File: test_markov_process.py  Project: mindis/RL-book
 def next_state(state=state):
     switch_states = Bernoulli(self.p).sample()
     return not state if switch_states else state
Code example #14
        # one Beta posterior per arm, initialized to the uniform Beta(1, 1) prior
        bayes: List[Tuple[int, int]] = [(1, 1)] * self.num_arms
        ep_rewards: ndarray = empty(self.time_steps)
        ep_actions: ndarray = empty(self.time_steps, dtype=int)

        for i in range(self.time_steps):
            # Thompson sampling: draw a mean from each arm's Beta posterior
            mean_draws: Sequence[float] = \
                [Beta(α=alpha, β=beta).sample() for alpha, beta in bayes]
            action: int = max(enumerate(mean_draws), key=itemgetter(1))[0]
            reward: float = float(self.arm_distributions[action].sample())
            alpha, beta = bayes[action]
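            # conjugate Beta-Bernoulli update: a success increments alpha, a failure increments beta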
            bayes[action] = (alpha + int(reward), beta + int(1 - reward))
            ep_rewards[i] = reward
            ep_actions[i] = action
        return ep_rewards, ep_actions


if __name__ == '__main__':
    probs_data = [0.2, 0.4, 0.8, 0.5, 0.1, 0.9]
    mu_star = max(probs_data)
    steps = 1000
    episodes = 500

    arm_distrs = [Bernoulli(p) for p in probs_data]
    ts_bernoulli = ThompsonSamplingBernoulli(arm_distributions=arm_distrs,
                                             time_steps=steps,
                                             num_episodes=episodes)
    # exp_cum_regret = ts_bernoulli.get_expected_cum_regret(mu_star)
    # print(exp_cum_regret)
    # exp_act_count = ts_bernoulli.get_expected_action_counts()
    # print(exp_act_count)

    ts_bernoulli.plot_exp_cum_regret_curve(mu_star)
Code example #15
 def next_state(state=state):
     switch_states = Bernoulli(self.p).sample()
     next_st: bool = not state.state if switch_states else state.state
     return NonTerminal(next_st)