Example #1
def sarsa_control_scratch(
        mdp_to_sample: FiniteMarkovDecisionProcess,
        states: List[S],
        actions: Mapping[S, List[A]],
        γ: float,
        num_episodes: int = 10000,
        eps: float = 0.1,
        base_lr: float = 0.03,
        half_life: float = 1000.0,
        exponent: float = 0.5) -> Mapping[Tuple[S, A], float]:

    q: Mapping[Tuple[S, A], float] = {}
    counts_per_state_act: Mapping[Tuple[S, A], int] = {}
    for state in states:
        if actions[state] is None:
            continue
        for action in actions[state]:
            q[(state, action)] = 0.
            counts_per_state_act[(state, action)] = 0
    policy_map: Mapping[S, Optional[Categorical[A]]] = {}
    for state in states:
        if actions[state] is None:
            policy_map[state] = None
        else:
            policy_map[state] = Categorical(
                {action: 1
                 for action in actions[state]})
    Pi: FinitePolicy[S, A] = FinitePolicy(policy_map)
    # start states are drawn uniformly from the non-terminal states
    start_distribution = Categorical(
        {s: 1 for s in states if actions[s] is not None})
    state = start_distribution.sample()
    for i in range(num_episodes):
        action_distribution = Pi.act(state)
        action = action_distribution.sample()
        next_distribution = mdp_to_sample.step(state, action)
        next_state, reward = next_distribution.sample()
        counts_per_state_act[(state, action)] += 1
        alpha = base_lr / (1 + (
            (counts_per_state_act[(state, action)] - 1) / half_life)**exponent)
        # SARSA update: the next action is sampled from the current
        # (epsilon-greedy) policy; bootstrap with 0 at terminal states
        if actions.get(next_state) is None:
            next_q = 0.
        else:
            next_action = Pi.act(next_state).sample()
            next_q = q[(next_state, next_action)]
        q[(state, action)] += alpha * (reward + γ * next_q - q[(state, action)])
        # Improve the policy at the visited state: epsilon-greedy w.r.t. q
        new_pol: Mapping[S, Optional[Categorical[A]]] = dict(Pi.policy_map)
        if actions[state] is None:
            new_pol[state] = None
        else:
            eps_greedy_map = {
                a: eps / len(actions[state])
                for a in actions[state]
            }
            best_action = max(actions[state], key=lambda a: q[(state, a)])
            eps_greedy_map[best_action] += 1 - eps
            new_pol[state] = Categorical(eps_greedy_map)
        Pi = FinitePolicy(new_pol)
        # If the episode terminated, restart from a random start state
        if actions.get(next_state) is None:
            state = start_distribution.sample()
        else:
            state = next_state
    return q
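
The step size alpha in sarsa_control_scratch decays with the visit count of each (state, action) pair. As a quick, self-contained illustration (the helper name learning_rate is ours, not part of the example), the following evaluates the schedule base_lr / (1 + ((n - 1) / half_life) ** exponent) at a few visit counts under the same default hyperparameters; the step size is exactly halved once the count reaches half_life + 1:

def learning_rate(n: int,
                  base_lr: float = 0.03,
                  half_life: float = 1000.0,
                  exponent: float = 0.5) -> float:
    # same count-based schedule as the alpha computed inside the SARSA loop
    return base_lr / (1 + ((n - 1) / half_life) ** exponent)

for n in [1, 11, 101, 1001, 10001]:
    print(n, learning_rate(n))
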
Example #2
def mc_control_scratch(
        mdp_to_sample: FiniteMarkovDecisionProcess,
        states: List[S],
        actions: Mapping[S, List[A]],
        γ: float,
        tolerance: float = 1e-6,
        num_episodes: int = 10000) -> Mapping[Tuple[S, A], float]:

    q: Mapping[Tuple[S, A], float] = {}
    counts_per_state_act: Mapping[Tuple[S, A], int] = {}
    for state in states:
        if actions[state] is None:
            continue
        for action in actions[state]:
            q[(state, action)] = 0.
            counts_per_state_act[(state, action)] = 0
    policy_map: Mapping[S, Optional[Categorical[A]]] = {}
    for state in states:
        if actions[state] is None:
            policy_map[state] = None
        else:
            policy_map[state] = Categorical(
                {action: 1
                 for action in actions[state]})
    Pi: FinitePolicy[S, A] = FinitePolicy(policy_map)
    start_state_distrib = Categorical({state: 1 for state in states})
    for i in range(num_episodes):
        trace: Iterable[TransitionStep[S, A]] = mdp_to_sample.simulate_actions(
            start_state_distrib, Pi)
        episode = returns(trace, γ, tolerance)
        for step in episode:
            state = step.state
            action = step.action
            return_ = step.return_
            counts_per_state_act[(state, action)] += 1
            q[(state, action)] += 1 / counts_per_state_act[
                (state, action)] * (return_ - q[(state, action)])
        # GLIE schedule: exploration decays as 1 / (episode number)
        eps = 1 / (i + 1)
        new_pol: Mapping[S, Optional[Categorical[A]]] = {}
        for state in states:
            if actions[state] is None:
                new_pol[state] = None
                continue
            eps_greedy_map = {
                action: eps / len(actions[state])
                for action in actions[state]
            }
            best_action = max(actions[state], key=lambda a: q[(state, a)])
            eps_greedy_map[best_action] += 1 - eps
            new_pol[state] = Categorical(eps_greedy_map)
        Pi = FinitePolicy(new_pol)

    return q
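
The tabular update q += (return - q) / count used above is an incremental running mean of the sampled returns, and eps = 1 / (i + 1) makes the policy GLIE (greedy in the limit with infinite exploration). A minimal self-contained check of the incremental mean, with made-up returns:

returns_sample = [4.0, 2.0, 6.0, 0.0]
q_est, count = 0.0, 0
for g in returns_sample:
    count += 1
    # same incremental-mean update as in mc_control_scratch
    q_est += (g - q_est) / count
assert abs(q_est - sum(returns_sample) / len(returns_sample)) < 1e-12
print(q_est)  # 3.0
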
Example #3
def policy_iteration(
    mdp: FiniteMarkovDecisionProcess[S, A],
    gamma: float,
    matrix_method_for_mrp_eval: bool = False
) -> Iterator[Tuple[V[S], FinitePolicy[S, A]]]:
    '''Calculate the value function (V*) of the given MDP by improving
    the policy repeatedly after evaluating the value function for a policy
    '''

    def update(vf_policy: Tuple[V[S], FinitePolicy[S, A]])\
            -> Tuple[V[S], FinitePolicy[S, A]]:

        vf, pi = vf_policy
        mrp: FiniteMarkovRewardProcess[S] = mdp.apply_finite_policy(pi)
        policy_vf: V[S] = {mrp.non_terminal_states[i]: v for i, v in
                           enumerate(mrp.get_value_function_vec(gamma))}\
            if matrix_method_for_mrp_eval else evaluate_mrp_result(mrp, gamma)
        improved_pi: FinitePolicy[S, A] = greedy_policy_from_vf(
            mdp, policy_vf, gamma)

        return policy_vf, improved_pi

    v_0: V[S] = {s: 0.0 for s in mdp.non_terminal_states}
    pi_0: FinitePolicy[S, A] = FinitePolicy(
        {s: Choose(set(mdp.actions(s)))
         for s in mdp.non_terminal_states})
    return iterate(update, (v_0, pi_0))
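
policy_iteration returns an endless iterator of successively improved (value function, policy) pairs, and the caller decides when to stop. A generic, self-contained way to consume such an iterator is sketched below; first_converged is our own helper (a plain stopping rule on the value functions), not a utility from the library these examples import:

from itertools import islice

def first_converged(vf_policy_pairs, tol: float = 1e-6, max_iters: int = 1000):
    # stop once two successive value functions agree to within tol at every state
    prev_vf, result = None, None
    for result in islice(vf_policy_pairs, max_iters):
        vf, _ = result
        if prev_vf is not None and \
                max(abs(vf[s] - prev_vf[s]) for s in vf) < tol:
            break
        prev_vf = vf
    return result

# usage sketch: opt_vf, opt_pi = first_converged(policy_iteration(mdp, gamma))
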
Example #4
    def get_q_learning_vf_and_policy(
            self,
            states_actions_dict: Mapping[Cell, Optional[Set[Move]]],
            sample_func: Callable[[Cell, Move], Tuple[Cell, float]],
            episodes: int = 10000,
            step_size: float = 0.01,
            epsilon: float = 0.1) -> Tuple[V[Cell], FinitePolicy[Cell, Move]]:
        '''
        states_actions_dict gives us the set of possible moves from
        a non-block cell.
        sample_func is a function with two inputs: state and action,
        and with output as a sampled pair of (next_state, reward).
        '''
        q: Dict[Cell, Dict[Move, float]] = \
            {s: {a: 0. for a in actions} for s, actions in
             states_actions_dict.items() if actions is not None}
        nt_states: CellSet = {s for s in q}
        uniform_states: Choose[Cell] = Choose(nt_states)
        for episode_num in range(episodes):
            state: Cell = uniform_states.sample()
            # Q-learning fill-in: behave epsilon-greedily, bootstrap with the
            # greedy max over next-state actions (0 at terminal states);
            # epsilon_greedy_action is assumed to be the same helper used by
            # get_sarsa_vf_and_policy, and gamma = 1 as in that method
            while state in nt_states:
                action: Move = WindyGrid.epsilon_greedy_action(
                    state, q, epsilon)
                next_state, reward = sample_func(state, action)
                next_q: float = max(q[next_state].values()) \
                    if next_state in nt_states else 0.
                q[state][action] += step_size * \
                    (reward + next_q - q[state][action])
                state = next_state

        vf_dict: V[Cell] = {s: max(d.values()) for s, d in q.items()}
        policy: FinitePolicy[Cell, Move] = FinitePolicy({
            s: Constant(max(d.items(), key=itemgetter(1))[0])
            for s, d in q.items()
        })
        return (vf_dict, policy)
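
For comparison with the SARSA method in Example #15 below, the two algorithms differ only in the bootstrap target: SARSA uses the Q-value of the sampled next action, while Q-learning uses the greedy maximum over next-state actions. A tiny sketch with hypothetical numbers:

next_q = {'up': 1.0, 'down': 3.0}   # made-up Q-values at the next state
reward, gamma = -1.0, 1.0
sarsa_target = reward + gamma * next_q['up']               # sampled next action was 'up'
q_learning_target = reward + gamma * max(next_q.values())  # greedy max
print(sarsa_target, q_learning_target)  # 0.0 2.0
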
Example #5
def main(num_pads):
    # 2^(num_pads-2) deterministic policies
    fc_mdp: FiniteMarkovDecisionProcess[FrogState,
                                        Any] = FrogCroak(num_pads + 1)
    all_fp = list(itertools.product(['A', 'B'], repeat=fc_mdp.num_pads - 2))
    all_mrp_value = []
    for fp in all_fp:
        fdp: FinitePolicy[FrogState, Any] = FinitePolicy(
            {FrogState(i + 1): Constant(fp[i])
             for i in range(len(fp))})
        implied_mrp: FiniteMarkovRewardProcess[
            FrogState] = fc_mdp.apply_finite_policy(fdp)
        all_mrp_value.append(implied_mrp.get_value_function_vec(1))

    # find the optimal policy: for each state, take the policy index with the
    # highest value; a single policy dominates in every state, so the set of
    # argmax indices collapses to one element
    max_indices = []
    value_matrix = np.array(all_mrp_value)
    for i in range(num_pads - 1):
        max_indices.append(np.argmax(value_matrix[:, i]))
    max_index = list(set(max_indices))[0]
    print(value_matrix[max_index, :])
    print(all_fp[max_index])
    plt.plot([
        'State' + str(i + 1) + ',' + all_fp[max_index][i]
        for i in range(num_pads - 1)
    ], value_matrix[max_index, :], 'o')
    plt.xlabel('Frog State')
    plt.ylabel('Probability')
    plt.title('n = ' + str(num_pads - 1))
    plt.show()
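
The comment at the top of main states there are 2^(num_pads - 2) deterministic policies, one binary A/B choice per non-terminal pad. A quick self-contained check of that count (the value of num_pads here is made up for illustration):

import itertools

num_pads = 6
all_fp = list(itertools.product(['A', 'B'], repeat=num_pads - 2))
assert len(all_fp) == 2 ** (num_pads - 2)
print(len(all_fp), all_fp[:3])
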
Example #6
    def setUp(self):
        user_capacity = 2
        user_poisson_lambda = 1.0
        user_holding_cost = 1.0
        user_stockout_cost = 10.0

        self.gamma = 0.9

        self.si_mdp: FiniteMarkovDecisionProcess[InventoryState, int] =\
            SimpleInventoryMDPCap(
                capacity=user_capacity,
                poisson_lambda=user_poisson_lambda,
                holding_cost=user_holding_cost,
                stockout_cost=user_stockout_cost
            )

        self.fdp: FinitePolicy[InventoryState, int] = FinitePolicy({
            InventoryState(alpha, beta):
            Constant(user_capacity - (alpha + beta))
            for alpha in range(user_capacity + 1)
            for beta in range(user_capacity + 1 - alpha)
        })

        self.implied_mrp: FiniteMarkovRewardProcess[InventoryState] =\
            self.si_mdp.apply_finite_policy(self.fdp)

        self.states: Sequence[InventoryState] = \
            self.implied_mrp.non_terminal_states
Example #7
    def setUp(self):
        ii = 12
        self.steps = 8
        pairs = [(1.0, 0.5), (0.7, 1.0), (0.5, 1.5), (0.3, 2.5)]
        self.cp: ClearancePricingMDP = ClearancePricingMDP(
            initial_inventory=ii,
            time_steps=self.steps,
            price_lambda_pairs=pairs)

        def policy_func(x: int) -> int:
            return 0 if x < 2 else (1 if x < 5 else (2 if x < 8 else 3))

        stationary_policy: FinitePolicy[int, int] = FinitePolicy(
            {s: Constant(policy_func(s))
             for s in range(ii + 1)})

        self.single_step_mrp: FiniteMarkovRewardProcess[
            int] = self.cp.single_step_mdp.apply_finite_policy(
                stationary_policy)

        self.mrp_seq = unwrap_finite_horizon_MRP(
            finite_horizon_MRP(self.single_step_mrp, self.steps))

        self.single_step_mdp: FiniteMarkovDecisionProcess[
            int, int] = self.cp.single_step_mdp

        self.mdp_seq = unwrap_finite_horizon_MDP(
            finite_horizon_MDP(self.single_step_mdp, self.steps))
Example #8
def optimal_vf_and_policy(
        steps: Sequence[StateActionMapping[S, A]],
        gamma: float) -> Iterator[Tuple[V[S], FinitePolicy[S, A]]]:
    """Use backwards induction to find the optimal value function and optimal
    policy at each time step

    """
    v_p: List[Tuple[Dict[S, float], FinitePolicy[S, A]]] = []

    for step in reversed(steps):
        this_v: Dict[S, float] = {}
        this_a: Dict[S, FiniteDistribution[A]] = {}
        for s, actions_map in step.items():
            if actions_map is not None:
                action_values = ((
                    res.expectation(lambda s_r: s_r[1] + gamma * (
                        v_p[-1][0][s_r[0]]
                        if len(v_p) > 0 and s_r[0] in v_p[-1][0] else 0.0)),
                    a,
                ) for a, res in actions_map.items())
                v_star, a_star = max(action_values, key=itemgetter(0))
                this_v[s] = v_star
                this_a[s] = Constant(a_star)
        v_p.append((this_v, FinitePolicy(this_a)))

    return reversed(v_p)
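
optimal_vf_and_policy applies the backwards-induction recursion V*_t(s) = max_a E[ r + gamma * V*_{t+1}(s') ], with the value beyond the horizon taken as 0. A tiny self-contained illustration of the same recursion on a hypothetical two-state, two-action problem run for two time steps, with each action's outcomes listed as (probability, next_state, reward) triples:

gamma = 1.0
step = {
    's1': {'a': [(1.0, 's1', 1.0)],
           'b': [(0.5, 's1', 0.0), (0.5, 's2', 4.0)]},
    's2': {'a': [(1.0, 's2', 0.0)],
           'b': [(1.0, 's1', 1.0)]},
}
v_next = {'s1': 0.0, 's2': 0.0}    # value beyond the horizon
for t in range(2):                 # walk backwards through two identical steps
    v_t = {}
    for s, acts in step.items():
        v_t[s] = max(sum(p * (r + gamma * v_next[nxt]) for p, nxt, r in outcomes)
                     for outcomes in acts.values())
    v_next = v_t
print(v_next)  # {'s1': 3.5, 's2': 3.0}
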
Example #9
    def get_all_deterministic_policies(
            self) -> Sequence[FinitePolicy[LilypadState, str]]:
        bin_to_act = {'0': 'A', '1': 'B'}
        all_action_comb = self.get_all_action_combinations()
        all_policies = []
        for action_comb in all_action_comb:
            policy: FinitePolicy[LilypadState, str] = FinitePolicy({
                LilypadState(i + 1): Constant(bin_to_act[a])
                for i, a in enumerate(action_comb)
            })
            all_policies.append(policy)
        return all_policies
Example #10
def get_policies(n: int) -> Iterable[FinitePolicy[StatePond, Action]]:
    list_policies: Iterable[FinitePolicy[StatePond, Action]] = []
    liste_actions: list = list(itertools.product(['A', 'B'], repeat=n - 1))
    for i in liste_actions:
        policy_map: Mapping[StatePond, Optional[FiniteDistribution[Action]]] = {}
        # pads 0 and n are terminal, so no action is prescribed there
        policy_map[StatePond(0)] = None
        policy_map[StatePond(n)] = None
        for j in range(n - 1):
            policy_map[StatePond(j + 1)] = Constant(Action(i[j]))
        list_policies += [FinitePolicy(policy_map)]
    return list_policies
Example #11
def get_opt_vf_from_q(
        q_value: Mapping[Tuple[S, A], float]
) -> Tuple[Mapping[S, float], FinitePolicy[S, A]]:
    v: Mapping[S, float] = {}
    policy_map: Mapping[S, Optional[Constant[A]]] = {}
    for i in q_value:
        state, action = i
        # keep the best action (and its value) seen so far for each state
        if state not in v or q_value[i] > v[state]:
            v[state] = q_value[i]
            policy_map[state] = Constant(action)
    Pi = FinitePolicy(policy_map)
    return (v, Pi)
Example #12
def get_vf_and_policy_from_qvf(
        mdp: FiniteMarkovDecisionProcess[S, A],
        qvf: FunctionApprox[Tuple[S, A]]) -> Tuple[V[S], FinitePolicy[S, A]]:
    opt_vf: V[S] = {
        s: max(qvf((s, a)) for a in mdp.actions(s))
        for s in mdp.non_terminal_states
    }
    opt_policy: FinitePolicy[S, A] = FinitePolicy({
        s: Constant(qvf.argmax((s, a) for a in mdp.actions(s))[1])
        for s in mdp.non_terminal_states
    })
    return opt_vf, opt_policy
Example #13
def greedy_policy_from_vf(mdp: FiniteMarkovDecisionProcess[S, A], vf: V[S],
                          gamma: float) -> FinitePolicy[S, A]:
    greedy_policy_dict: Dict[S, FiniteDistribution[A]] = {}

    for s in mdp.non_terminal_states:

        q_values: Iterator[Tuple[A, float]] = \
            ((a, mdp.mapping[s][a].expectation(
                lambda s_r: s_r[1] + gamma * vf.get(s_r[0], 0.)
            )) for a in mdp.actions(s))

        greedy_policy_dict[s] =\
            Constant(max(q_values, key=operator.itemgetter(1))[0])

    return FinitePolicy(greedy_policy_dict)
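
The generator inside greedy_policy_from_vf computes the one-step lookahead q(s, a) = E[ r + gamma * vf(s') ] before taking the maximizing action. A self-contained sketch of that expectation for a tabular distribution written as {(next_state, reward): probability} (all numbers made up):

def one_step_q(dist, vf, gamma):
    # dist maps (next_state, reward) pairs to probabilities
    return sum(p * (r + gamma * vf.get(s, 0.)) for (s, r), p in dist.items())

vf = {'x': 1.0, 'y': 3.0}
dist_a = {('x', 0.0): 0.5, ('y', 2.0): 0.5}
print(one_step_q(dist_a, vf, 0.9))  # 0.5*(0 + 0.9*1) + 0.5*(2 + 0.9*3) = 2.8 (up to float rounding)
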
Example #14
def initialize(
    mdp: FiniteMarkovDecisionProcess
) -> Tuple[V[S], FinitePolicy]:
    """Initialize value function and policy.

    Initialize the value function to zeros at each state, and initialize the
    policy to a random choice of the action space at each non-terminal state.

    :param mdp: Object representation of a finite Markov decision process
    :returns: Value function initialized at zeros for each state
    :returns: Random Initial policy
    """
    # Set value function at each state equal to zero
    v_0: V[S] = {s: 0 for s in mdp.states()}
    # Set the policy to be a random choice of the action space at each state
    pi_0: FinitePolicy[S, A] = FinitePolicy(
        {s: Choose(set(mdp.actions(s))) for s in mdp.non_terminal_states}
    )
    return v_0, pi_0
Example #15
    def get_sarsa_vf_and_policy(
            self,
            states_actions_dict: Mapping[Cell, Optional[Set[Move]]],
            sample_func: Callable[[Cell, Move], Tuple[Cell, float]],
            episodes: int = 10000,
            step_size: float = 0.01
    ) -> Tuple[V[Cell], FinitePolicy[Cell, Move]]:
        '''
        states_actions_dict gives us the set of possible moves from
        a non-block cell.
        sample_func is a function with two inputs: state and action,
        and with output as a sampled pair of (next_state, reward).
        '''
        q: Dict[Cell, Dict[Move, float]] = \
            {s: {a: 0. for a in actions} for s, actions in
             states_actions_dict.items() if actions is not None}
        nt_states: CellSet = {s for s in q}
        uniform_states: Choose[Cell] = Choose(nt_states)
        for episode_num in range(episodes):
            epsilon: float = 1.0 / (episode_num + 1)
            state: Cell = uniform_states.sample()
            action: Move = WindyGrid.epsilon_greedy_action(state, q, epsilon)
            while state in nt_states:
                next_state, reward = sample_func(state, action)
                if next_state in nt_states:
                    next_action: Move = WindyGrid.epsilon_greedy_action(
                        next_state, q, epsilon)
                    # undiscounted SARSA update (this problem uses gamma = 1)
                    q[state][action] += step_size * \
                        (reward + q[next_state][next_action] -
                         q[state][action])
                    action = next_action
                else:
                    q[state][action] += step_size * (reward - q[state][action])
                state = next_state

        vf_dict: V[Cell] = {s: max(d.values()) for s, d in q.items()}
        policy: FinitePolicy[Cell, Move] = FinitePolicy({
            s: Constant(max(d.items(), key=itemgetter(1))[0])
            for s, d in q.items()
        })
        return (vf_dict, policy)
Example #16
def policy_iteration(
    mdp: FiniteMarkovDecisionProcess[S, A], gamma: float,
    approx0: FunctionApprox[S]
) -> Iterator[Tuple[FunctionApprox[S], FinitePolicy[S, A]]]:
    '''Calculate the value function (V*) of the given MDP by improving
    the policy repeatedly after evaluating the value function for a policy
    '''

    def update(vf_policy: Tuple[FunctionApprox[S], FinitePolicy[S, A]])\
            -> Tuple[FunctionApprox[S], FinitePolicy[S, A]]:

        vf, pi = vf_policy
        mrp: FiniteMarkovRewardProcess[S] = mdp.apply_finite_policy(pi)
        #policy_vf: FunctionApprox[S] = approximate_policy_evaluation_result(mdp,pi,vf)
        policy_vf: FunctionApprox[S] = evaluate_mrp_result(mrp, gamma, vf)
        improved_pi: FinitePolicy[S, A] = greedy_policy_from_approx_vf(
            mdp, policy_vf, gamma)
        return policy_vf, improved_pi

    pi_0: FinitePolicy[S, A] = FinitePolicy(
        {s: Choose(set(mdp.actions(s)))
         for s in mdp.non_terminal_states})
    return iterate(update, (approx0, pi_0))
Example #17
def get_optimality(n: int) -> Tuple[Tuple[str, ...], np.ndarray]:
    fl_mdp = FrogAndLilypadsMDP(n)
    print(fl_mdp.get_action_transition_reward_map())
    deterministic_policies = product("AB", repeat=n - 1)
    odp = None
    ovf = None
    for prod in deterministic_policies:
        policy_map = {0: None, n: None}
        for i in range(1, n):
            policy_map[i] = Categorical({prod[i - 1]: 1})
        policy = FinitePolicy(policy_map)
        fl_mrp = fl_mdp.apply_finite_policy(policy)
        value_function = fl_mrp.get_value_function_vec(1)
        if odp is None:
            odp = policy
            odp_keys = prod
            ovf = value_function
        else:
            comparison = [(value_function[i] > ovf[i]) for i in range(n - 1)]
            if all(comparison):
                odp = policy
                odp_keys = prod
                ovf = value_function
    return odp_keys, ovf
Example #18
    # start state distribution: every non-terminal state has equal probability to be the start state
    start_states = Categorical({
        state: 1 / len(si_mdp.non_terminal_states)
        for state in si_mdp.non_terminal_states
    })

    mc_tabular_control = mc_control(si_mdp, start_states,
                                    Tabular(start_map, start_map), user_gamma,
                                    800)
    values_map = mc_tabular_control.values_map
    opt_vf, opt_pi = get_optimal_policy(values_map)
    print('opt_vf mc control: \n', opt_vf, '\nopt_pi mc control: \n', opt_pi)

    fdp: FinitePolicy[InventoryState, int] = FinitePolicy({
        InventoryState(alpha, beta): Constant(user_capacity - (alpha + beta))
        for alpha in range(user_capacity + 1)
        for beta in range(user_capacity + 1 - alpha)
    })
    implied_mrp: FiniteMarkovRewardProcess[InventoryState] = \
        si_mdp.apply_finite_policy(fdp)

    print("MDP Value Iteration Optimal Value Function and Optimal Policy")
    print("--------------")
    opt_vf_vi, opt_policy_vi = value_iteration_result(si_mdp, gamma=user_gamma)
    print(opt_vf_vi, '\n')
    print(opt_policy_vi)

    print("MDP Policy Iteration Optimal Value Function and Optimal Policy")
    print("--------------")
    opt_vf_pi, opt_policy_pi = policy_iteration_result(si_mdp,
                                                       gamma=user_gamma)
Example #19
def compare_mc_sarsa_ql(fmdp: FiniteMarkovDecisionProcess[S, A],
                        method_mask: Tuple[bool, bool, bool],
                        learning_rates: Sequence[Tuple[float, float,
                                                       float]], gamma: float,
                        epsilon_as_func_of_episodes: Callable[[int], float],
                        q_learning_epsilon: float,
                        mc_episode_length_tol: float, num_episodes: int,
                        plot_batch: int, plot_start: int) -> None:
    true_vf: V[S] = value_iteration_result(fmdp, gamma)[0]
    states: Sequence[S] = fmdp.non_terminal_states
    colors: Sequence[str] = ['b', 'g', 'r', 'k', 'c', 'm', 'y']

    import matplotlib.pyplot as plt
    plt.figure(figsize=(11, 7))

    if method_mask[0]:
        for k, (init_lr, half_life, exponent) in enumerate(learning_rates):
            mc_funcs_it: Iterator[FunctionApprox[Tuple[S, A]]] = \
                glie_mc_finite_control_learning_rate(
                    fmdp=fmdp,
                    initial_learning_rate=init_lr,
                    half_life=half_life,
                    exponent=exponent,
                    gamma=gamma,
                    epsilon_as_func_of_episodes=epsilon_as_func_of_episodes,
                    episode_length_tolerance=mc_episode_length_tol
                )
            mc_errors = []
            batch_mc_errs = []
            for i, mc_qvf in enumerate(
                    itertools.islice(mc_funcs_it, num_episodes)):
                mc_vf: V[S] = {
                    s: max(mc_qvf((s, a)) for a in fmdp.actions(s))
                    for s in states
                }
                batch_mc_errs.append(
                    sqrt(
                        sum((mc_vf[s] - true_vf[s])**2
                            for s in states) / len(states)))
                if i % plot_batch == plot_batch - 1:
                    mc_errors.append(sum(batch_mc_errs) / plot_batch)
                    batch_mc_errs = []
            mc_plot = mc_errors[plot_start:]
            label = f"MC InitRate={init_lr:.3f},HalfLife" + \
                f"={half_life:.0f},Exp={exponent:.1f}"
            plt.plot(range(len(mc_plot)),
                     mc_plot,
                     color=colors[k],
                     linestyle='-',
                     label=label)

    sample_episodes: int = 1000
    uniform_policy: FinitePolicy[S, A] = FinitePolicy(
        {s: Choose(set(fmdp.actions(s)))
         for s in states})
    fmrp: FiniteMarkovRewardProcess[S] = \
        fmdp.apply_finite_policy(uniform_policy)
    td_episode_length: int = int(
        round(
            sum(
                len(
                    list(
                        returns(trace=fmrp.simulate_reward(Choose(set(
                            states))),
                                γ=gamma,
                                tolerance=mc_episode_length_tol)))
                for _ in range(sample_episodes)) / sample_episodes))

    if method_mask[1]:
        for k, (init_lr, half_life, exponent) in enumerate(learning_rates):
            sarsa_funcs_it: Iterator[FunctionApprox[Tuple[S, A]]] = \
                glie_sarsa_finite_learning_rate(
                    fmdp=fmdp,
                    initial_learning_rate=init_lr,
                    half_life=half_life,
                    exponent=exponent,
                    gamma=gamma,
                    epsilon_as_func_of_episodes=epsilon_as_func_of_episodes,
                    max_episode_length=td_episode_length,
                )
            sarsa_errors = []
            transitions_batch = plot_batch * td_episode_length
            batch_sarsa_errs = []

            for i, sarsa_qvf in enumerate(
                    itertools.islice(sarsa_funcs_it,
                                     num_episodes * td_episode_length)):
                sarsa_vf: V[S] = {
                    s: max(sarsa_qvf((s, a)) for a in fmdp.actions(s))
                    for s in states
                }
                batch_sarsa_errs.append(
                    sqrt(
                        sum((sarsa_vf[s] - true_vf[s])**2
                            for s in states) / len(states)))
                if i % transitions_batch == transitions_batch - 1:
                    sarsa_errors.append(
                        sum(batch_sarsa_errs) / transitions_batch)
                    batch_sarsa_errs = []
            sarsa_plot = sarsa_errors[plot_start:]
            label = f"SARSA InitRate={init_lr:.3f},HalfLife" + \
                f"={half_life:.0f},Exp={exponent:.1f}"
            plt.plot(range(len(sarsa_plot)),
                     sarsa_plot,
                     color=colors[k],
                     linestyle='--',
                     label=label)

    if method_mask[2]:
        for k, (init_lr, half_life, exponent) in enumerate(learning_rates):
            ql_funcs_it: Iterator[FunctionApprox[Tuple[S, A]]] = \
                q_learning_finite_learning_rate(
                    fmdp=fmdp,
                    initial_learning_rate=init_lr,
                    half_life=half_life,
                    exponent=exponent,
                    gamma=gamma,
                    epsilon=q_learning_epsilon,
                    max_episode_length=td_episode_length,
                )
            ql_errors = []
            transitions_batch = plot_batch * td_episode_length
            batch_ql_errs = []

            for i, ql_qvf in enumerate(
                    itertools.islice(ql_funcs_it,
                                     num_episodes * td_episode_length)):
                ql_vf: V[S] = {
                    s: max(ql_qvf((s, a)) for a in fmdp.actions(s))
                    for s in states
                }
                batch_ql_errs.append(
                    sqrt(
                        sum((ql_vf[s] - true_vf[s])**2
                            for s in states) / len(states)))
                if i % transitions_batch == transitions_batch - 1:
                    ql_errors.append(sum(batch_ql_errs) / transitions_batch)
                    batch_ql_errs = []
            ql_plot = ql_errors[plot_start:]
            label = f"Q-Learning InitRate={init_lr:.3f},HalfLife" + \
                f"={half_life:.0f},Exp={exponent:.1f}"
            plt.plot(range(len(ql_plot)),
                     ql_plot,
                     color=colors[k],
                     linestyle=':',
                     label=label)

    plt.xlabel("Episode Batches", fontsize=20)
    plt.ylabel("Optimal Value Function RMSE", fontsize=20)
    plt.title("RMSE as function of episode batches", fontsize=20)
    plt.grid(True)
    plt.legend(fontsize=10)
    plt.show()
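
Each curve above plots, per batch of episodes (or transitions), the root-mean-squared error between the estimated value function and the true optimal value function from value iteration. The error term is the following expression, restated as a standalone helper with hypothetical dictionaries:

from math import sqrt

def vf_rmse(est_vf, true_vf):
    # RMSE over states between an estimated and a reference value function
    return sqrt(sum((est_vf[s] - true_vf[s]) ** 2 for s in true_vf) / len(true_vf))

print(vf_rmse({'s1': 1.0, 's2': 2.0}, {'s1': 1.5, 's2': 2.5}))  # 0.5
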
Example #20
    fe_mdp: FiniteMarkovDecisionProcess[FrogEscapeState, int] = \
        FrogEscapeMDP(
            n=6,
            initial_pad=1
        )

    print("MDP Transition Map")
    print("------------------")
    print(fe_mdp)

    # set up a stochastic policy: each action (0 or 1) with probability 0.5
    # at every non-terminal state
    fdp: FinitePolicy[FrogEscapeState, int] = FinitePolicy(
        {
            FrogEscapeState(i): Categorical({
                0: 0.5,
                1: 0.5
            })
            for i in range(1, fe_mdp.n)
        }
    )

    print("Policy Map")
    print("----------")
    print(fdp)

    implied_mrp: FiniteMarkovRewardProcess[FrogEscapeState] =\
        fe_mdp.apply_finite_policy(fdp)
    print("Implied MP Transition Map")
    print("--------------")
    print(FiniteMarkovProcess(implied_mrp.transition_map))
Example #21
    ii = 12  # initial inventory; assumed to match the identical test setup in Example #7
    steps = 8
    pairs = [(1.0, 0.5), (0.7, 1.0), (0.5, 1.5), (0.3, 2.5)]
    cp: ClearancePricingMDP = ClearancePricingMDP(
        initial_inventory=ii,
        time_steps=steps,
        price_lambda_pairs=pairs
    )
    print("Clearance Pricing MDP")
    print("---------------------")
    print(cp.mdp)

    def policy_func(x: int) -> int:
        return 0 if x < 2 else (1 if x < 5 else (2 if x < 8 else 3))

    stationary_policy: FinitePolicy[int, int] = FinitePolicy(
        {s: Constant(policy_func(s)) for s in range(ii + 1)}
    )

    single_step_mrp: FiniteMarkovRewardProcess[int] = \
        cp.single_step_mdp.apply_finite_policy(stationary_policy)

    vf_for_policy: Iterator[V[int]] = evaluate(
        unwrap_finite_horizon_MRP(finite_horizon_MRP(single_step_mrp, steps)),
        1.
    )

    print("Value Function for Stationary Policy")
    print("------------------------------------")
    for t, vf in enumerate(vf_for_policy):
        print(f"Time Step {t:d}")
        print("---------------")