Example #1
def glie_mc_finite_control_learning_rate(
    fmdp: FiniteMarkovDecisionProcess[S, A],
    initial_learning_rate: float,
    half_life: float,
    exponent: float,
    gamma: float,
    epsilon_as_func_of_episodes: Callable[[int], float],
    episode_length_tolerance: float = 1e-5
) -> Iterator[QValueFunctionApprox[S, A]]:
    initial_qvf_dict: Mapping[Tuple[NonTerminal[S], A],
                              float] = {(s, a): 0.
                                        for s in fmdp.non_terminal_states
                                        for a in fmdp.actions(s)}
    learning_rate_func: Callable[[int], float] = learning_rate_schedule(
        initial_learning_rate=initial_learning_rate,
        half_life=half_life,
        exponent=exponent)
    return mc.glie_mc_control(
        mdp=fmdp,
        states=Choose(fmdp.non_terminal_states),
        approx_0=Tabular(values_map=initial_qvf_dict,
                         count_to_weight_func=learning_rate_func),
        γ=gamma,
        ϵ_as_func_of_episodes=epsilon_as_func_of_episodes,
        episode_length_tolerance=episode_length_tolerance)
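For context, a minimal sketch (an assumption about the rl library, not its actual source) of the kind of decaying step size that learning_rate_schedule is expected to supply as Tabular's count_to_weight_func:

from typing import Callable

def example_learning_rate_schedule(initial_learning_rate: float,
                                   half_life: float,
                                   exponent: float) -> Callable[[int], float]:
    # alpha(n) = alpha_0 / (1 + (n - 1) / half_life) ** exponent,
    # so the n-th update of a (state, action) pair gets a shrinking weight
    def lr_func(n: int) -> float:
        return initial_learning_rate * (1 + (n - 1) / half_life) ** -exponent
    return lr_func

With such a weight alpha(n), each tabular update performed inside glie_mc_control amounts to Q(s, a) <- (1 - alpha(n)) * Q(s, a) + alpha(n) * G, where G is the observed return.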
Example #2
def td_prediction(transitions: Iterable[mp.TransitionStep[S]],
                  count_to_weight_func: Callable[[int], float],
                  gamma: float,
                  max_steps: int = 5000) -> Tabular[S]:
    """
    Similar as Monte Carlo Scratch except replacing return y with R_{t+1} + gamma*V(S_{t+1}) for updates
    """
    values_map: Dict[S, float] = {}
    counts_map: Dict[S, int] = {}
    count = 0
    diff = {}  # dict: state and its value error
    for transition in transitions:
        state = transition.state
        if state not in diff:
            diff[state] = 100  # large initial error so a new state is not judged converged
        counts_map[state] = counts_map.get(state, 0) + 1
        weight: float = count_to_weight_func(counts_map.get(state, 0))
        if transition.next_state not in values_map:
            # arbitrary default value for a next-state not yet seen
            values_map[transition.next_state] = -30

        # TD target: R_{t+1} + gamma * V(S_{t+1})
        y = transition.reward + gamma * values_map[transition.next_state]
        diff[state] = min(abs(y - values_map.get(state, 0.)), diff[state])
        values_map[state] = weight * y + (1 - weight) * values_map.get(
            state, 0.)
        count += 1
        # stop once the step budget is exhausted or the largest per-state
        # error has dropped below the threshold
        if count >= max_steps or diff[max(
                diff.items(), key=operator.itemgetter(1))[0]] < 1e-4:
            print(diff[max(diff.items(), key=operator.itemgetter(1))[0]])
            break

    return Tabular(values_map, counts_map, count_to_weight_func)
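A hypothetical usage sketch for the function above, driving it with the SimpleInventoryMRPFinite used in later snippets; the import paths and constructor arguments are assumptions, not part of the original:

from rl.chapter2.simple_inventory_mrp import SimpleInventoryMRPFinite
from rl.distribution import Choose

si_mrp = SimpleInventoryMRPFinite(capacity=2, poisson_lambda=1.0,
                                  holding_cost=1.0, stockout_cost=10.0)
# a single long stream of atomic transitions from a uniformly chosen start state
transitions = si_mrp.simulate_reward(Choose(si_mrp.non_terminal_states))
vf = td_prediction(transitions,
                   count_to_weight_func=lambda n: 0.05,  # constant step size
                   gamma=0.9,
                   max_steps=5000)
print(vf.values_map)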
Example #3
    def test_evaluate_finite_mdp(self) -> None:
        q_0: Tabular[Tuple[bool, bool]] = Tabular(
            {(s, a): 0.0
             for s in self.finite_mdp.states()
             for a in self.finite_mdp.actions(s)},
            count_to_weight_func=lambda _: 0.1,
        )

        uniform_policy: mdp.Policy[bool, bool] = mdp.FinitePolicy({
            s: Choose(self.finite_mdp.actions(s))
            for s in self.finite_mdp.states()
        })

        transitions: Iterable[mdp.TransitionStep[
            bool, bool]] = self.finite_mdp.simulate_actions(
                Choose(self.finite_mdp.states()), uniform_policy)

        qs = td.td_control(transitions, self.finite_mdp.actions, q_0, γ=0.99)

        q: Optional[Tabular[Tuple[bool, bool]]] = iterate.last(
            cast(Iterator[Tabular[Tuple[bool, bool]]],
                 itertools.islice(qs, 20000)))

        if q is not None:
            self.assertEqual(len(q.values_map), 4)

            for s in [True, False]:
                self.assertLess(abs(q((s, False)) - 170.0), 2)
                self.assertGreater(q((s, False)), q((s, True)))
        else:
            assert False
Example #4
def td_lambda_finite_prediction_learning_rate(
    fmrp: FiniteMarkovRewardProcess[S],
    gamma: float,
    lambd: float,
    episode_length: int,
    initial_learning_rate: float,
    half_life: float,
    exponent: float,
    initial_vf_dict: Mapping[NonTerminal[S], float]
) -> Iterator[ValueFunctionApprox[S]]:
    episodes: Iterable[Iterable[TransitionStep[S]]] = \
        fmrp_episodes_stream(fmrp)
    curtailed_episodes: Iterable[Iterable[TransitionStep[S]]] = \
        (itertools.islice(episode, episode_length) for episode in episodes)
    learning_rate_func: Callable[[int], float] = learning_rate_schedule(
        initial_learning_rate=initial_learning_rate,
        half_life=half_life,
        exponent=exponent
    )
    return td_lambda.td_lambda_prediction(
        traces=curtailed_episodes,
        approx_0=Tabular(
            values_map=initial_vf_dict,
            count_to_weight_func=learning_rate_func
        ),
        γ=gamma,
        lambd=lambd
    )
Example #5
    def test_evaluate_finite_mrp(self) -> None:
        start = Tabular(
            {s: 0.0
             for s in self.finite_flip_flop.states()},
            count_to_weight_func=lambda _: 0.1,
        )

        episode_length = 20
        episodes: Iterable[Iterable[
            mp.TransitionStep[bool]]] = self.finite_flip_flop.reward_traces(
                Choose({True, False}))
        transitions: Iterable[
            mp.TransitionStep[bool]] = itertools.chain.from_iterable(
                itertools.islice(episode, episode_length)
                for episode in episodes)

        vs = td.td_prediction(transitions, γ=0.99, approx_0=start)

        v: Optional[Tabular[bool]] = iterate.last(
            itertools.islice(cast(Iterator[Tabular[bool]], vs), 10000))

        if v is not None:
            self.assertEqual(len(v.values_map), 2)

            for s in v.values_map:
                # Intentionally loose bound—otherwise test is too slow.
                # Takes >1s on my machine otherwise.
                self.assertLess(abs(v(s) - 170), 3.0)
        else:
            assert False
Example #6
def td_finite_prediction_learning_rate(
    fmrp: FiniteMarkovRewardProcess[S],
    gamma: float,
    episode_length: int,
    initial_learning_rate: float,
    half_life: float,
    exponent: float,
    initial_vf_dict: Mapping[S, float],
) -> Iterator[FunctionApprox[S]]:
    episodes: Iterable[Iterable[TransitionStep[S]]] = fmrp_episodes_stream(fmrp)
    td_experiences: Iterable[TransitionStep[S]] = unit_experiences_from_episodes(
        episodes, episode_length
    )
    learning_rate_func: Callable[[int], float] = learning_rate_schedule(
        initial_learning_rate=initial_learning_rate,
        half_life=half_life,
        exponent=exponent,
    )
    return td.td_prediction(
        transitions=td_experiences,
        approx_0=Tabular(
            values_map=initial_vf_dict, count_to_weight_func=learning_rate_func
        ),
        γ=gamma,
    )
Example #7
    def test_evaluate_mrp(self):
        vf = evaluate(self.mrp_seq, 1.)
        states = self.single_step_mrp.states()
        fa_dynamic = Dynamic({s: 0.0 for s in states})
        fa_tabular = Tabular()
        distribution = Choose(set(states))
        approx_vf_finite = backward_evaluate_finite(
            [(self.mrp_seq[i], fa_dynamic) for i in range(self.steps)],
            1.
        )
        approx_vf = backward_evaluate(
            [(self.single_step_mrp, fa_tabular, distribution)
             for _ in range(self.steps)],
            1.,
            num_state_samples=120,
            error_tolerance=0.01
        )

        for t, (v1, v2, v3) in enumerate(zip(
                vf,
                approx_vf_finite,
                approx_vf
        )):
            states = self.mrp_seq[t].keys()
            v1_arr = np.array([v1[s] for s in states])
            v2_arr = v2.evaluate(states)
            v3_arr = v3.evaluate(states)
            self.assertLess(max(abs(v1_arr - v2_arr)), 0.001)
            self.assertLess(max(abs(v1_arr - v3_arr)), 1.0)
Example #8
    def test_value_iteration(self):
        vpstar = optimal_vf_and_policy(self.mdp_seq, 1.)
        states = self.single_step_mdp.states()
        fa_dynamic = Dynamic({s: 0.0 for s in states})
        fa_tabular = Tabular()
        distribution = Choose(set(states))
        approx_vpstar_finite = back_opt_vf_and_policy_finite(
            [(self.mdp_seq[i], fa_dynamic) for i in range(self.steps)],
            1.
        )
        approx_vpstar = back_opt_vf_and_policy(
            [(self.single_step_mdp, fa_tabular, distribution)
             for _ in range(self.steps)],
            1.,
            num_state_samples=120,
            error_tolerance=0.01
        )

        for t, ((v1, _), (v2, _), (v3, _)) in enumerate(zip(
                vpstar,
                approx_vpstar_finite,
                approx_vpstar
        )):
            states = self.mdp_seq[t].keys()
            v1_arr = np.array([v1[s] for s in states])
            v2_arr = v2.evaluate(states)
            v3_arr = v3.evaluate(states)
            self.assertLess(max(abs(v1_arr - v2_arr)), 0.001)
            self.assertLess(max(abs(v1_arr - v3_arr)), 1.0)
Example #9
def mc_finite_prediction_learning_rate(
    fmrp: FiniteMarkovRewardProcess[S],
    gamma: float,
    episode_length_tolerance: float,
    initial_learning_rate: float,
    half_life: float,
    exponent: float,
    initial_vf_dict: Mapping[NonTerminal[S], float]
) -> Iterator[ValueFunctionApprox[S]]:
    episodes: Iterable[Iterable[TransitionStep[S]]] = \
        fmrp_episodes_stream(fmrp)
    learning_rate_func: Callable[[int], float] = learning_rate_schedule(
        initial_learning_rate=initial_learning_rate,
        half_life=half_life,
        exponent=exponent
    )
    return mc.mc_prediction(
        traces=episodes,
        approx_0=Tabular(
            values_map=initial_vf_dict,
            count_to_weight_func=learning_rate_func
        ),
        γ=gamma,
        episode_length_tolerance=episode_length_tolerance
    )
def mc_prediction(episodes_stream: Iterator[Sequence[TransitionStep[S]]],
                  gamma: float, num_episodes: int) -> Mapping[S, float]:
    return iterate.last(
        itertools.islice(
            mc.mc_prediction(traces=episodes_stream,
                             approx_0=Tabular(),
                             γ=gamma,
                             tolerance=1e-10), num_episodes)).values_map
Example #11
def mc_finite_prediction_equal_wts(
        fmrp: FiniteMarkovRewardProcess[S], gamma: float, tolerance: float,
        initial_vf_dict: Mapping[S, float]) -> Iterator[FunctionApprox[S]]:
    episodes: Iterable[Iterable[TransitionStep[S]]] = \
        fmrp_episodes_stream(fmrp)
    return mc.mc_prediction(traces=episodes,
                            approx_0=Tabular(values_map=initial_vf_dict),
                            γ=gamma,
                            tolerance=tolerance)
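The "equal weights" in the name relies on Tabular's default count_to_weight_func, which is assumed to be 1/n; weighting the n-th observation by 1/n makes each state's estimate the plain sample average of its observed returns, as this toy check illustrates:

# Illustrative check: a 1/n weight reproduces the running sample mean.
observations = [3.0, 7.0, 5.0, 9.0]
estimate = 0.0
for n, y in enumerate(observations, start=1):
    weight = 1.0 / n  # assumed Tabular default
    estimate = weight * y + (1 - weight) * estimate
assert abs(estimate - sum(observations) / len(observations)) < 1e-12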
Example #12
def mc_prediction(transitions: Iterable[mp.TransitionStep[S]],
                  count_to_weight_func: Callable[[int], float],
                  gamma: float,
                  tolerance: float = 1e-200) -> Tabular[S]:
    '''
    Tabular MC prediction over a discrete set of states S, without any
    interpolation. The value estimate of each state is maintained as a
    weighted mean of the observed returns, with the weight of each update
    given by `count_to_weight_func` (i.e. a learning rate α(n) that may
    depend on the number of updates n the state has received so far).

    Internals:
    values_map -- mapping from each state to its current value estimate
    counts_map -- how many times each state has been updated
    count_to_weight_func -- weight of an update to a state as a function
      of the number of times that state has been updated

    Returns the Tabular value function approximation after processing a
    single (curtailed) stream of transitions.
    '''
    values_map: Dict[S, float] = {}
    counts_map: Dict[S, int] = {}
    trace = []
    count = 0
    diff = {}
    max_steps = round(math.log(tolerance) / math.log(gamma))
    print('max steps: ', max_steps)
    # get trace
    for transition in transitions:
        trace.append(transition)
        count += 1
        if count >= max_steps:
            break
    # get corresponding return
    transitions_returns = returns(trace, gamma, tolerance)
    trace_returns = [return_ for return_ in transitions_returns]

    for i in range(len(trace)):
        # x: state; y: return for first n occurrences of x
        x = trace[i].state
        y = trace_returns[i].return_
        if x not in diff:
            diff[x] = 100
        diff[x] = min(abs(y - values_map.get(x, 0.)), diff[x])
        if diff[max(diff.items(), key=operator.itemgetter(1))[0]] < 1e-4:
            break
        counts_map[x] = counts_map.get(x, 0) + 1
        weight: float = count_to_weight_func(counts_map.get(x, 0))
        values_map[x] = weight * y + (1 - weight) * values_map.get(x, 0.)
    print(diff[max(diff.items(), key=operator.itemgetter(1))[0]])
    return Tabular(values_map, counts_map, count_to_weight_func)
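The update on the last line of the loop above is the standard incremental tabular rule written as a convex combination; the two forms are algebraically identical, as this small check shows (arbitrary numbers, purely illustrative):

old_value, target, alpha = 4.2, 7.0, 0.3
convex_form = alpha * target + (1 - alpha) * old_value
incremental_form = old_value + alpha * (target - old_value)
assert abs(convex_form - incremental_form) < 1e-12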
def td_prediction(experiences_stream: Iterator[TransitionStep[S]],
                  gamma: float, num_experiences: int) -> Mapping[S, float]:
    return iterate.last(
        itertools.islice(
            td.td_prediction(
                transitions=experiences_stream,
                approx_0=Tabular(count_to_weight_func=learning_rate_schedule(
                    initial_learning_rate=0.01, half_life=10000,
                    exponent=0.5)),
                γ=gamma), num_experiences)).values_map
Example #14
def td_lambda_tabular_prediction(
        transitions: Iterable[mp.TransitionStep[S]],
        count_to_weight_func: Callable[[int], float],
        gamma: float,
        lambd: float,
        max_steps: int = 2000,
        tolerance: float = 1e-200) -> Tuple[Tabular[S], int]:
    """
    Similar to TD Scratch except replacing use G_{t,n} for updates
    """
    values_map: Dict[S, float] = {}
    counts_map: Dict[S, int] = {}
    trace = []
    count = 0
    diff = {}  # dict: state and its value error
    for transition in transitions:
        count += 1
        trace.append(transition)
        if count > max_steps:
            break

    # get corresponding return
    transitions_returns = returns(trace, gamma, tolerance)
    trace_returns = [return_ for return_ in transitions_returns]

    for i in range(max_steps):
        transition = trace[i]
        state = transition.state
        if state not in diff:
            diff[state] = 100
        counts_map[state] = counts_map.get(state, 0) + 1
        weight: float = count_to_weight_func(counts_map.get(state, 0))
        if transition.next_state not in values_map:
            values_map[transition.next_state] = -30
        # Truncated lambda-return target:
        #   lambd^(T-t-1) * G_t  +  (1 - lambd) * sum_n lambd^(n-1) * G_{t,n}
        y = lambd**(max_steps - i - 1) * trace_returns[i].return_
        if lambd == 0:
            # the full-return term must vanish for lambd == 0
            # (guards against Python's 0**0 == 1 on the final step)
            y = 0
        for n in range(1, max_steps - i):
            g_tn = 0
            for j in range(i, i + n):
                next_transition = trace[j]
                g_tn += gamma**(j - i) * next_transition.reward
                if j == i + n - 1:
                    # bootstrap with the current estimate of V(S_{t+n})
                    g_tn += gamma**n * values_map.get(
                        next_transition.next_state, 0)
            y += (1 - lambd) * lambd**(n - 1) * g_tn
        diff[state] = min(abs(y - values_map.get(state, 0.)), diff[state])
        values_map[state] = weight * y + (1 - weight) * values_map.get(
            state, 0.)
        # print(y, values_map[state])
        count += 1
        if diff[max(diff.items(), key=operator.itemgetter(1))[0]] < 0.1:
            break
    print(diff[max(diff.items(), key=operator.itemgetter(1))[0]])
    return Tabular(values_map, counts_map, count_to_weight_func), i
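For clarity, here is an equivalent standalone form of the target that the nested loops above build for a single time step: a truncated lambda-return mixing the n-step returns G_{t,n}. The helper below is a sketch with hypothetical inputs, not a drop-in replacement:

from typing import Sequence

def truncated_lambda_return(rewards: Sequence[float],
                            next_values: Sequence[float],
                            final_return: float,
                            gamma: float,
                            lambd: float) -> float:
    # rewards[k] = R_{t+k+1}; next_values[k] = current estimate of V(S_{t+k+1});
    # final_return = the full return G_t over the truncated horizon
    horizon = len(rewards)
    g_lambda = lambd ** (horizon - 1) * final_return if lambd > 0 else 0.0
    for n in range(1, horizon):
        # n-step bootstrapped return G_{t,n}
        g_tn = sum(gamma ** k * rewards[k] for k in range(n)) \
            + gamma ** n * next_values[n - 1]
        g_lambda += (1 - lambd) * lambd ** (n - 1) * g_tn
    return g_lambda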
Example #15
    def test_evaluate_finite_mrp(self):
        start = Tabular({s: 0.0 for s in self.finite_flip_flop.states()})
        traces = self.finite_flip_flop.reward_traces(Choose({True, False}))
        v = iterate.converged(
            mc.evaluate_mrp(traces, γ=0.99, approx_0=start),
            # Loose bound of 0.025 to speed up test.
            done=lambda a, b: a.within(b, 0.025))

        self.assertEqual(len(v.values_map), 2)

        for s in v.values_map:
            # Intentionally loose bound—otherwise test is too slow.
            # Takes >1s on my machine otherwise.
            self.assertLess(abs(v(s) - 170), 1.0)
Example #16
def glie_mc_finite_control_equal_wts(
    fmdp: FiniteMarkovDecisionProcess[S, A],
    gamma: float,
    epsilon_as_func_of_episodes: Callable[[int], float],
    episode_length_tolerance: float = 1e-5,
) -> Iterator[QValueFunctionApprox[S, A]]:
    initial_qvf_dict: Mapping[Tuple[NonTerminal[S], A],
                              float] = {(s, a): 0.
                                        for s in fmdp.non_terminal_states
                                        for a in fmdp.actions(s)}
    return mc.glie_mc_control(
        mdp=fmdp,
        states=Choose(fmdp.non_terminal_states),
        approx_0=Tabular(values_map=initial_qvf_dict),
        γ=gamma,
        ϵ_as_func_of_episodes=epsilon_as_func_of_episodes,
        episode_length_tolerance=episode_length_tolerance)
Example #17
def td_nbootstrap_tabular_prediction(transitions: Iterable[
    mp.TransitionStep[S]],
                                     count_to_weight_func: Callable[[int],
                                                                    float],
                                     gamma: float,
                                     n: int,
                                     max_steps: int = 5000,
                                     tolerance: float = 1e-10) -> Tabular[S]:
    """
    Similar to TD Scratch except replacing use G_{t,n} for updates
    """
    values_map: Dict[S, float] = {}
    counts_map: Dict[S, int] = {}
    trace = []
    count = 0
    diff = {}  # dict: state and its value error
    for transition in transitions:
        count += 1
        trace.append(transition)
        if count > max_steps + n:
            break
    for i in range(max_steps):
        transition = trace[i]
        state = transition.state
        if state not in diff:
            diff[state] = 100
        counts_map[state] = counts_map.get(state, 0) + 1
        weight: float = count_to_weight_func(counts_map.get(state, 0))
        if transition.next_state not in values_map:
            values_map[transition.next_state] = -10
        # n-step target: G_{t,n} = sum_k gamma^k R_{t+k+1} + gamma^n V(S_{t+n})
        y = transition.reward
        if n == 1:
            # the loop below is empty when n == 1, so bootstrap directly
            y += gamma * values_map.get(transition.next_state, 0)
        for j in range(i + 1, i + n):
            next_transition = trace[j]
            y += gamma**(j - i) * next_transition.reward
            if j == i + n - 1:
                # bootstrap with the current estimate of V(S_{t+n})
                y += gamma**n * values_map.get(next_transition.next_state, 0)
        diff[state] = min(abs(y - values_map.get(state, 0.)), diff[state])
        values_map[state] = weight * y + (1 - weight) * values_map.get(
            state, 0.)
        count += 1
        if diff[max(diff.items(), key=operator.itemgetter(1))[0]] < 1e-4:
            break
    print(diff[max(diff.items(), key=operator.itemgetter(1))[0]])
    return Tabular(values_map, counts_map, count_to_weight_func)
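The target built inside the loop above is the n-step bootstrapped return; written out on its own (a hypothetical helper with the same conventions, for illustration only):

from typing import Sequence

def n_step_return(rewards: Sequence[float],
                  bootstrap_value: float,
                  gamma: float) -> float:
    # G_{t,n} = sum_{k=0}^{n-1} gamma^k * R_{t+k+1} + gamma^n * V(S_{t+n}),
    # with rewards[k] = R_{t+k+1} and bootstrap_value = current V(S_{t+n})
    n = len(rewards)
    return sum(gamma ** k * r for k, r in enumerate(rewards)) \
        + gamma ** n * bootstrap_value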
Example #18
    def test_evaluate_finite_mdp(self) -> None:
        q_0: Tabular[Tuple[NonTerminal[bool], bool]] = Tabular(
            {(s, a): 0.0
             for s in self.finite_mdp.non_terminal_states
             for a in self.finite_mdp.actions(s)},
            count_to_weight_func=lambda _: 0.1
        )

        uniform_policy: FinitePolicy[bool, bool] =\
            FinitePolicy({
                s.state: Choose(self.finite_mdp.actions(s))
                for s in self.finite_mdp.non_terminal_states
            })

        transitions: Iterable[mdp.TransitionStep[bool, bool]] =\
            self.finite_mdp.simulate_actions(
                Choose(self.finite_mdp.non_terminal_states),
                uniform_policy
            )

        qs = td.q_learning_external_transitions(
            transitions,
            self.finite_mdp.actions,
            q_0,
            γ=0.99
        )

        q: Optional[Tabular[Tuple[NonTerminal[bool], bool]]] =\
            iterate.last(
                cast(Iterator[Tabular[Tuple[NonTerminal[bool], bool]]],
                     itertools.islice(qs, 20000))
            )

        if q is not None:
            self.assertEqual(len(q.values_map), 4)

            for s in [NonTerminal(True), NonTerminal(False)]:
                self.assertLess(abs(q((s, False)) - 170.0), 2)
                self.assertGreater(q((s, False)), q((s, True)))
        else:
            assert False
Example #19
def glie_sarsa_finite_learning_rate(
        fmdp: FiniteMarkovDecisionProcess[S, A], initial_learning_rate: float,
        half_life: float, exponent: float, gamma: float,
        epsilon_as_func_of_episodes: Callable[[int], float],
        max_episode_length: int) -> Iterator[FunctionApprox[Tuple[S, A]]]:
    initial_qvf_dict: Mapping[Tuple[S, A],
                              float] = {(s, a): 0.
                                        for s in fmdp.non_terminal_states
                                        for a in fmdp.actions(s)}
    learning_rate_func: Callable[[int], float] = learning_rate_schedule(
        initial_learning_rate=initial_learning_rate,
        half_life=half_life,
        exponent=exponent)
    return td.glie_sarsa(mdp=fmdp,
                         states=Choose(set(fmdp.non_terminal_states)),
                         approx_0=Tabular(
                             values_map=initial_qvf_dict,
                             count_to_weight_func=learning_rate_func),
                         γ=gamma,
                         ϵ_as_func_of_episodes=epsilon_as_func_of_episodes,
                         max_episode_length=max_episode_length)
Example #20
def q_learning_finite_learning_rate(
        fmdp: FiniteMarkovDecisionProcess[S, A], initial_learning_rate: float,
        half_life: float, exponent: float, gamma: float, epsilon: float,
        max_episode_length: int) -> Iterator[QValueFunctionApprox[S, A]]:
    initial_qvf_dict: Mapping[Tuple[NonTerminal[S], A],
                              float] = {(s, a): 0.
                                        for s in fmdp.non_terminal_states
                                        for a in fmdp.actions(s)}
    learning_rate_func: Callable[[int], float] = learning_rate_schedule(
        initial_learning_rate=initial_learning_rate,
        half_life=half_life,
        exponent=exponent)
    return td.q_learning(mdp=fmdp,
                         policy_from_q=lambda f, m: mc.epsilon_greedy_policy(
                             q=f, mdp=m, ϵ=epsilon),
                         states=Choose(fmdp.non_terminal_states),
                         approx_0=Tabular(
                             values_map=initial_qvf_dict,
                             count_to_weight_func=learning_rate_func),
                         γ=gamma,
                         max_episode_length=max_episode_length)
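A hypothetical usage sketch for the wrapper above, running Q-learning on the SimpleInventoryMDPCap that other snippets here appear to use; the import paths, constructor arguments and hyperparameters are assumptions:

from itertools import islice
from rl.chapter3.simple_inventory_mdp_cap import SimpleInventoryMDPCap
import rl.iterate as iterate

si_mdp = SimpleInventoryMDPCap(capacity=2, poisson_lambda=1.0,
                               holding_cost=1.0, stockout_cost=10.0)
qvfs = q_learning_finite_learning_rate(
    fmdp=si_mdp,
    initial_learning_rate=0.03,
    half_life=1000.0,
    exponent=0.5,
    gamma=0.9,
    epsilon=0.2,
    max_episode_length=100)
# take the Q-value approximation after 10000 updates
final_qvf = iterate.last(islice(qvfs, 10000))
print(final_qvf.values_map)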
Example #21

def example_model_data_generator() -> Iterator[DataSeq]:

    coeffs: Aug_Triple = (2., 10., 4., -6.)
    values = np.linspace(-10.0, 10.0, 21)
    pts: Sequence[Triple] = [(x, y, z) for x in values for y in values
                             for z in values]
    d = norm(loc=0., scale=2.0)

    while True:
        res: List[Tuple[Triple, float]] = []
        for pt in pts:
            x_val: Triple = (pt[0], pt[1], pt[2])
            y_val: float = coeffs[0] + np.dot(coeffs[1:], pt) + \
                d.rvs(size=1)[0]
            res.append((x_val, y_val))
        yield res


if __name__ == '__main__':
    training_iterations: int = 30
    data_gen: Iterator[DataSeq] = example_model_data_generator()
    test_data: DataSeq = list(next(data_gen))

    tabular: Tabular[Triple] = Tabular()
    for xy_seq in islice(data_gen, training_iterations):
        tabular = tabular.update(xy_seq)
        this_rmse: float = tabular.rmse(test_data)
        print(f"RMSE = {this_rmse:.3f}")
Example #22
                                   poisson_lambda=user_poisson_lambda,
                                   holding_cost=user_holding_cost,
                                   stockout_cost=user_stockout_cost)
    # initialize values_map and counts_map for Tabular
    start_map = {}
    for state in si_mdp.mapping.keys():
        for action in si_mdp.actions(state):
            start_map[(state, action)] = 0
    # start state distribution: every non-terminal state has equal probability to be the start state
    start_states = Categorical({
        state: 1 / len(si_mdp.non_terminal_states)
        for state in si_mdp.non_terminal_states
    })

    mc_tabular_control = mc_control(si_mdp, start_states,
                                    Tabular(start_map, start_map), user_gamma,
                                    800)
    values_map = mc_tabular_control.values_map
    opt_vf, opt_pi = get_optimal_policy(values_map)
    print('opt_vf mc control: \n', opt_vf, '\nopt_pi mc control: \n', opt_pi)

    fdp: FinitePolicy[InventoryState, int] = FinitePolicy({
        InventoryState(alpha, beta): Constant(user_capacity - (alpha + beta))
        for alpha in range(user_capacity + 1)
        for beta in range(user_capacity + 1 - alpha)
    })
    implied_mrp: FiniteMarkovRewardProcess[InventoryState] = \
        si_mdp.apply_finite_policy(fdp)

    print("MDP Value Iteration Optimal Value Function and Optimal Policy")
    print("--------------")
Example #23
si_mrp = SimpleInventoryMRPFinite(
    capacity=user_capacity,
    poisson_lambda=user_poisson_lambda,
    holding_cost=user_holding_cost,
    stockout_cost=user_stockout_cost
)
print("Value Function")
print("--------------")
si_mrp.display_value_function(gamma=user_gamma)
print()

states: List[InventoryState] = si_mrp.non_terminal_states
start_state_distrib: Categorical[InventoryState] = \
    Categorical({i: 1 for i in states})
simulation_episodes = si_mrp.reward_traces(start_state_distrib)
simulation_transitions = si_mrp.simulate_reward(start_state_distrib)
approx_0 = Tabular({i: 0 for i in states})
value_mc = mc_prediction_scratch(
    traces=simulation_episodes,
    states=states,
    γ=user_gamma,
    tolerance=1e-6,
    num_episodes=10000
)
print("Value Function with our implementation of MC")
print(value_mc)

value_mc_other = mc_prediction(
    traces=simulation_episodes,
    approx_0=approx_0,
    γ=user_gamma
)
Example #24
initial_vf_dict: Mapping[NonTerminal[InventoryState], float] = \
    {s: 0. for s in si_mrp.non_terminal_states}

gamma: float = 0.9
lambda_param = 0.3
num_episodes = 10000

episode_length: int = 100
initial_learning_rate: float = 0.03
half_life: float = 1000.0
exponent: float = 0.5

approx_0: Tabular[NonTerminal[InventoryState]] = Tabular(
    values_map=initial_vf_dict,
    count_to_weight_func=learning_rate_schedule(
        initial_learning_rate=initial_learning_rate,
        half_life=half_life,
        exponent=exponent
    )
)

episodes: Iterable[Iterable[TransitionStep[InventoryState]]] = \
    si_mrp.reward_traces(Choose(si_mrp.non_terminal_states))
traces: Iterable[Iterable[TransitionStep[InventoryState]]] = \
        (itertools.islice(episode, episode_length) for episode in episodes)

vf_iter: Iterator[Tabular[NonTerminal[InventoryState]]] = \
    lambda_return_prediction(
        traces=traces,
        approx_0=approx_0,
        γ=gamma,
        lambd=lambda_param
    )

time_decay_half_life: float = 3000
num_updates: int = 10000

q_iter: Iterator[QValueFunctionApprox[InventoryState, int]] = \
    q_learning_experience_replay(
        mdp=si_mdp,
        policy_from_q=lambda f, m: epsilon_greedy_policy(
            q=f,
            mdp=m,
            ϵ=epsilon
        ),
        states=Choose(si_mdp.non_terminal_states),
        approx_0=Tabular(
            count_to_weight_func=learning_rate_schedule(
                initial_learning_rate=initial_learning_rate,
                half_life=learning_rate_half_life,
                exponent=learning_rate_exponent
            )
        ),
        γ=gamma,
        max_episode_length=episode_length,
        mini_batch_size=mini_batch_size,
        weights_decay_half_life=time_decay_half_life
    )

qvf: QValueFunctionApprox[InventoryState, int] = iterate.last(
    itertools.islice(q_iter, num_updates))
vf, pol = get_vf_and_policy_from_qvf(mdp=si_mdp, qvf=qvf)
pprint(vf)
print(pol)
Example #26
nt_states: Sequence[NonTerminal[int]] = random_walk.non_terminal_states
start_distribution: NTStateDistribution[int] = Choose(nt_states)
traces: Iterable[Iterable[TransitionStep[int]]] = \
    random_walk.reward_traces(start_distribution)
transitions: Iterable[TransitionStep[int]] = \
    itertools.chain.from_iterable(traces)

td_transitions: Iterable[TransitionStep[int]] = \
    itertools.islice(transitions, num_transitions)

initial_learning_rate: float = 0.5
half_life: float = 1000
exponent: float = 0.5
approx0: Tabular[NonTerminal[int]] = Tabular(
    count_to_weight_func=learning_rate_schedule(
        initial_learning_rate=initial_learning_rate,
        half_life=half_life,
        exponent=exponent))

td_func: Tabular[NonTerminal[int]] = \
    iterate.last(itertools.islice(
        td_prediction(
            transitions=td_transitions,
            approx_0=approx0,
            γ=gamma
        ),
        num_transitions
    ))
td_vf: np.ndarray = td_func.evaluate(nt_states)

num_polynomials: int = 5
Example #27
    transition_map = si_mdp.get_action_transition_reward_map()
    # fdp: markov_decision_process.FinitePolicy[InventoryState, int] = markov_decision_process.FinitePolicy(
    #     {InventoryState(alpha, beta):
    #          Constant(user_capacity - (alpha + beta)) for alpha in
    #      range(user_capacity + 1) for beta in range(user_capacity + 1 - alpha)}
    # )
    # initialize values_map and counts_map for Tabular
    start_map = {}
    state_action = {}
    for state in si_mdp.mapping.keys():
        state_action[state] = []
        for action in si_mdp.actions(state):
            start_map[(state, action)] = 0
            state_action[state].append(action)

    q = Tabular(start_map, start_map)
    start_states = Categorical({
        state: 1 / len(si_mdp.non_terminal_states)
        for state in si_mdp.non_terminal_states
    })
    # transitions = si_mdp.simulate_actions(start_states, fdp)

    sarsa_tabular_control = sarsa_control(start_states, transition_map,
                                          state_action, q, user_gamma, 0.1)
    diff = {}
    prev = q.values_map
    count = 0
    for fcn_approx in sarsa_tabular_control:
        next = fcn_approx.values_map
        print(fcn_approx.values_map)
        count += 1
Example #28
        [(x, f(x) + n.rvs(size=1)[0]) for x in x_pts]

    ff = lambda x: x

    BSApprox = BSplineApprox(feature_function=ff, degree=3)
    solved = BSApprox.solve(xy_vals_seq)
    errors: np.ndarray = solved.evaluate(x_pts) - np.array(
        [y for _, y in xy_vals_seq])
    print("Mean Squared Error")
    print(np.mean(errors * errors))

    print("Indirect Solve")
    BSApprox_ind = BSplineApprox(feature_function=ff,
                                 degree=3,
                                 knots=np.array([0, 1, 2, 3]),
                                 direct_solve=False)
    solved_ind = BSApprox_ind.solve(xy_vals_seq)
    errors: np.ndarray = solved_ind.evaluate(x_pts) - np.array(
        [y for _, y in xy_vals_seq])
    print("Mean Squared Error")
    print(np.mean(errors * errors))
    # The second method takes much longer to converge

    # PROBLEM 2
    print("Solving Problem 2")
    n = 10
    model = LilypadModel(n)
    approx0 = Tabular(values_map={s: 0.0 for s in model.non_terminal_states})
    result = approx_policy_iteration_result(model, 0.9, approx0)
    # Results consistent with what we had before
Example #29
            itertools.islice(trace, episode_length) for trace in traces
        )
    num_episodes = 100000

    print("Value Function (TD Function Approximation)")
    print("--------------")
    initial_learning_rate: float = 0.03
    half_life: float = 1000.0
    exponent: float = 0.5
    learning_rate_func: Callable[[int], float] = learning_rate_schedule(
        initial_learning_rate=initial_learning_rate,
        half_life=half_life,
        exponent=exponent)
    td_vfs: Iterator[FunctionApprox[InventoryState]] = evaluate_mrp(
        transitions=unit_experiences_accumulated,
        approx_0=Tabular(count_to_weight_func=learning_rate_func),
        γ=user_gamma)
    final_td_vf: FunctionApprox[InventoryState] = \
        last(itertools.islice(td_vfs, episode_length * num_episodes))
    pprint({s: round(final_td_vf(s), 3) for s in si_mrp.non_terminal_states})
    print()

    print("Value Function (Tabular MC from scratch)")
    print("--------------")
    td_vfs: Iterator[Dict[InventoryState, float]] = evaluate_mrp_dt(
        transitions=unit_experiences_accumulated,
        vf={s: 0
            for s in si_mrp.non_terminal_states},
        γ=user_gamma)
    final_td_vf: Dict[InventoryState, float] = \
        last(itertools.islice(td_vfs, episode_length * num_episodes))
Example #30
    si_mrp = SimpleInventoryMRPFinite(capacity=user_capacity,
                                      poisson_lambda=user_poisson_lambda,
                                      holding_cost=user_holding_cost,
                                      stockout_cost=user_stockout_cost)

    print("Value Function (Exact)")
    print("--------------")
    si_mrp.display_value_function(gamma=user_gamma)
    print()

    print("Value Function (MC Function Approximation)")
    print("--------------")
    traces: Iterable[Iterable[TransitionStep[InventoryState]]] = \
        si_mrp.reward_traces(Choose(set(si_mrp.non_terminal_states)))
    it: Iterator[FunctionApprox[InventoryState]] = evaluate_mrp(
        traces=traces, approx_0=Tabular(), γ=user_gamma)
    num_traces = 10000
    last_vf_mc: FunctionApprox[InventoryState] = last(islice(it, num_traces))
    pprint({
        s: round(last_vf_mc.evaluate([s])[0], 3)
        for s in si_mrp.non_terminal_states
    })
    print()

    print("Value Function (Tabular MC from scratch)")
    print("--------------")
    traces: Iterable[Iterable[TransitionStep[InventoryState]]] = \
        si_mrp.reward_traces(Choose(set(si_mrp.non_terminal_states)))
    it: Iterator[Dict[InventoryState, float]] = evaluate_mrp_mc(
        traces=traces,
        vf={s: 0