    def setUp(self):
        user_capacity = 2
        user_poisson_lambda = 1.0
        user_holding_cost = 1.0
        user_stockout_cost = 10.0

        self.gamma = 0.9

        self.si_mdp: FiniteMarkovDecisionProcess[InventoryState, int] =\
            SimpleInventoryMDPCap(
                capacity=user_capacity,
                poisson_lambda=user_poisson_lambda,
                holding_cost=user_holding_cost,
                stockout_cost=user_stockout_cost
            )

        # Deterministic base policy: always order up to capacity, i.e. the order
        # quantity is capacity - (on-hand alpha + on-order beta)
        self.fdp: FinitePolicy[InventoryState, int] = FinitePolicy({
            InventoryState(alpha, beta):
            Constant(user_capacity - (alpha + beta))
            for alpha in range(user_capacity + 1)
            for beta in range(user_capacity + 1 - alpha)
        })

        # Markov reward process implied by following this fixed policy in the MDP
        self.implied_mrp: FiniteMarkovRewardProcess[InventoryState] =\
            self.si_mdp.apply_finite_policy(self.fdp)

        self.states: Sequence[InventoryState] = \
            self.implied_mrp.non_terminal_states
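
    # A minimal sketch (assumed extra test method, not in the original snippet):
    # with capacity 2 the inventory states are the (alpha, beta) pairs with
    # alpha + beta <= 2, i.e. 3 + 2 + 1 = 6 non-terminal states.
    def test_non_terminal_state_count(self):
        self.assertEqual(len(self.states), 6)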
Example #2
                                                           not in opt_vf):
            opt_vf[state] = value
            opt_pi[state] = action
    return opt_vf, opt_pi


if __name__ == '__main__':
    user_capacity = 2
    user_poisson_lambda = 1.0
    user_holding_cost = 1.0
    user_stockout_cost = 10.0

    user_gamma = 0.9

    si_mdp = SimpleInventoryMDPCap(capacity=user_capacity,
                                   poisson_lambda=user_poisson_lambda,
                                   holding_cost=user_holding_cost,
                                   stockout_cost=user_stockout_cost)
    # initialize the values_map and counts_map used to build the Tabular
    # function approximation
    start_map = {}
    for state in si_mdp.mapping.keys():
        for action in si_mdp.actions(state):
            start_map[(state, action)] = 0
    # start-state distribution: uniform over the non-terminal states
    start_states = Categorical({
        state: 1 / len(si_mdp.non_terminal_states)
        for state in si_mdp.non_terminal_states
    })

    # Run tabular Monte Carlo control on the inventory MDP, starting each
    # episode from the uniform start-state distribution defined above
    mc_tabular_control = mc_control(si_mdp, start_states,
                                    Tabular(start_map, start_map), user_gamma,
                                    800)
Example #3
#             (transition.state, transition.action), transition.reward + γ * q((transition.next_state, next_transition.action))])
#         policy = markov_decision_process.policy_from_q(q, mdp, 1 / count)
#         transition = next_transition
#         print(count, q)
#     return q

if __name__ == '__main__':
    user_capacity = 2
    user_poisson_lambda = 1.0
    user_holding_cost = 1.0
    user_stockout_cost = 10.0

    user_gamma = 0.9

    si_mdp = SimpleInventoryMDPCap(capacity=user_capacity,
                                   poisson_lambda=user_poisson_lambda,
                                   holding_cost=user_holding_cost,
                                   stockout_cost=user_stockout_cost)
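    # transition_map: state -> action -> distribution of (next state, reward) pairs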
    transition_map = si_mdp.get_action_transition_reward_map()
    # fdp: markov_decision_process.FinitePolicy[InventoryState, int] = markov_decision_process.FinitePolicy(
    #     {InventoryState(alpha, beta):
    #          Constant(user_capacity - (alpha + beta)) for alpha in
    #      range(user_capacity + 1) for beta in range(user_capacity + 1 - alpha)}
    # )
    # initialize the values_map/counts_map for Tabular and collect the feasible
    # actions for each state
    start_map = {}
    state_action = {}
    for state in si_mdp.mapping.keys():
        state_action[state] = []
        for action in si_mdp.actions(state):
            start_map[(state, action)] = 0
            state_action[state].append(action)
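
    # A minimal sketch (assumed helper, not part of the original snippet):
    # epsilon-greedy action selection over the tabular Q-values in start_map,
    # using the per-state action lists collected in state_action above.
    import random

    def epsilon_greedy_action(q, state, eps=0.1):
        # explore with probability eps, otherwise act greedily w.r.t. q
        if random.random() < eps:
            return random.choice(state_action[state])
        return max(state_action[state], key=lambda a: q[(state, a)])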
Example #4
    print("Solving Problem 4")
    user_capacity = 2
    user_poisson_lambda = 1.0
    user_holding_cost = 1.0
    user_stockout_cost = 10.0
    user_gamma = 0.9
    
    user_capacity2 = 3
    user_poisson_lambda2 = 0.9
    user_holding_cost2 = 1.5
    user_stockout_cost2 = 15.0

    store1: FiniteMarkovDecisionProcess[InventoryState, int] =\
        SimpleInventoryMDPCap(
            capacity=user_capacity,
            poisson_lambda=user_poisson_lambda,
            holding_cost=user_holding_cost,
            stockout_cost=user_stockout_cost
        ) 
    store2: FiniteMarkovDecisionProcess[InventoryState, int] =\
        SimpleInventoryMDPCap(
            capacity=user_capacity2,
            poisson_lambda=user_poisson_lambda2,
            holding_cost=user_holding_cost2,
            stockout_cost=user_stockout_cost2
        )
    K1 = 1
    K2 = 1
    problem4 = ComplexMDP(store1=store1,
                          store2=store2,
                          K1=K1,
                          K2=K2)
Example #5
        # epsilon: largest change in any Q-value relative to the previous iterate
        epsilon = max(abs(qvf[s][a] - qvf_old[s][a]) for s in qvf for a in qvf[s])
        qvf_old = deepcopy(qvf)
        yield qvf



if __name__ == '__main__':
    user_capacity = 2
    user_poisson_lambda = 1.0
    user_holding_cost = 1.0
    user_stockout_cost = 10.0
    user_gamma = 0.9

    si_mdp = SimpleInventoryMDPCap(
        capacity=user_capacity,
        poisson_lambda=user_poisson_lambda,
        holding_cost=user_holding_cost,
        stockout_cost=user_stockout_cost
    )

    transition_map = si_mdp.get_action_transition_reward_map()

    def transition_function(s, a):
        return transition_map[s][a]

    # Enumerate states and feasible actions, and zero-initialize the tabular
    # Q-value function
    states = si_mdp.states()
    non_terminal = si_mdp.non_terminal_states
    actions = {s: list(si_mdp.actions(s)) for s in non_terminal}
    q_0 = {s: {a: 0 for a in actions[s]} for s in non_terminal}

    qvf_iter = find_qvf_mc_control(
        states=states,
Example #6
from rl.chapter3.simple_inventory_mdp_cap import SimpleInventoryMDPCap
from rl.chapter11.control_utils import get_vf_and_policy_from_qvf
from rl.monte_carlo import epsilon_greedy_policy
from rl.td import q_learning_experience_replay
from rl.dynamic_programming import value_iteration_result
import rl.iterate as iterate
import itertools
from pprint import pprint

capacity: int = 2
poisson_lambda: float = 1.0
holding_cost: float = 1.0
stockout_cost: float = 10.0

si_mdp: SimpleInventoryMDPCap = SimpleInventoryMDPCap(
    capacity=capacity,
    poisson_lambda=poisson_lambda,
    holding_cost=holding_cost,
    stockout_cost=stockout_cost)

gamma: float = 0.9      # discount factor
epsilon: float = 0.3    # exploration probability for the epsilon-greedy policy

# learning-rate schedule for the tabular Q-value updates
initial_learning_rate: float = 0.1
learning_rate_half_life: float = 1000
learning_rate_exponent: float = 0.5

# Q-learning with experience replay: cap on episode length, replay mini-batch
# size, half-life for down-weighting older experiences, and number of updates
episode_length: int = 100
mini_batch_size: int = 1000
time_decay_half_life: float = 3000
num_updates: int = 10000
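
# A minimal sketch (assumed usage, not part of the original snippet): compute the
# exact optimal value function and policy by value iteration as a benchmark for
# the Q-learning-with-experience-replay run configured above; the gamma= keyword
# is assumed to match value_iteration_result's signature.
true_opt_vf, true_opt_policy = value_iteration_result(si_mdp, gamma=gamma)
print("Optimal Value Function from Value Iteration:")
pprint(true_opt_vf)
print("Optimal Policy from Value Iteration:")
print(true_opt_policy)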