def setUp(self):
    user_capacity = 2
    user_poisson_lambda = 1.0
    user_holding_cost = 1.0
    user_stockout_cost = 10.0

    self.gamma = 0.9

    self.si_mdp: FiniteMarkovDecisionProcess[InventoryState, int] =\
        SimpleInventoryMDPCap(
            capacity=user_capacity,
            poisson_lambda=user_poisson_lambda,
            holding_cost=user_holding_cost,
            stockout_cost=user_stockout_cost
        )

    self.fdp: FinitePolicy[InventoryState, int] = FinitePolicy({
        InventoryState(alpha, beta): Constant(user_capacity - (alpha + beta))
        for alpha in range(user_capacity + 1)
        for beta in range(user_capacity + 1 - alpha)
    })

    self.implied_mrp: FiniteMarkovRewardProcess[InventoryState] =\
        self.si_mdp.apply_finite_policy(self.fdp)

    self.states: Sequence[InventoryState] = \
        self.implied_mrp.non_terminal_states
def test_flip_flop(self):
    trace = list(
        itertools.islice(
            self.flip_flop.simulate(Constant(NonTerminal(True))),
            10
        )
    )

    self.assertTrue(
        all(isinstance(outcome.state, bool) for outcome in trace))

    longer_trace = itertools.islice(
        self.flip_flop.simulate(Constant(NonTerminal(True))),
        10000
    )
    count_trues = len(
        list(outcome for outcome in longer_trace if outcome.state))

    # If the code is correct, this should fail with a vanishingly
    # small probability
    self.assertTrue(1000 < count_trues < 9000)
def get_opt_vf_and_policy(self) -> \
        Iterator[Tuple[V[int], FinitePolicy[int, bool]]]:
    dt: float = self.dt()
    up_factor: float = np.exp(self.vol * np.sqrt(dt))
    # Risk-neutral probability of an up-move on the binomial lattice:
    # (e^{r*dt} - d) / (u - d) with d = 1/u, rearranged below.
    up_prob: float = (np.exp(self.rate * dt) * up_factor - 1) / \
        (up_factor * up_factor - 1)
    return optimal_vf_and_policy(
        steps=[
            {j: None if j == -1 else {
                True: Constant(
                    (
                        -1,
                        self.payoff(i * dt, self.state_price(i, j))
                    )
                ),
                False: Categorical(
                    {
                        (j + 1, 0.): up_prob,
                        (j, 0.): 1 - up_prob
                    }
                )
            } for j in range(i + 1)}
            for i in range(self.num_steps + 1)
        ],
        gamma=np.exp(-self.rate * dt)
    )
def setUp(self):
    ii = 12
    self.steps = 8
    pairs = [(1.0, 0.5), (0.7, 1.0), (0.5, 1.5), (0.3, 2.5)]
    self.cp: ClearancePricingMDP = ClearancePricingMDP(
        initial_inventory=ii,
        time_steps=self.steps,
        price_lambda_pairs=pairs
    )

    def policy_func(x: int) -> int:
        return 0 if x < 2 else (1 if x < 5 else (2 if x < 8 else 3))

    stationary_policy: FinitePolicy[int, int] = FinitePolicy(
        {s: Constant(policy_func(s)) for s in range(ii + 1)})

    self.single_step_mrp: FiniteMarkovRewardProcess[int] = \
        self.cp.single_step_mdp.apply_finite_policy(stationary_policy)

    self.mrp_seq = unwrap_finite_horizon_MRP(
        finite_horizon_MRP(self.single_step_mrp, self.steps))

    self.single_step_mdp: FiniteMarkovDecisionProcess[int, int] = \
        self.cp.single_step_mdp

    self.mdp_seq = unwrap_finite_horizon_MDP(
        finite_horizon_MDP(self.single_step_mdp, self.steps))
def get_q_learning_vf_and_policy(
    self,
    states_actions_dict: Mapping[Cell, Optional[Set[Move]]],
    sample_func: Callable[[Cell, Move], Tuple[Cell, float]],
    episodes: int = 10000,
    step_size: float = 0.01,
    epsilon: float = 0.1
) -> Tuple[V[Cell], FinitePolicy[Cell, Move]]:
    '''
    states_actions_dict gives us the set of possible moves from
    a non-blocked cell.
    sample_func is a function with two inputs: state and action,
    and with output as a sampled pair of (next_state, reward).
    '''
    q: Dict[Cell, Dict[Move, float]] = \
        {s: {a: 0. for a in actions} for s, actions in
         states_actions_dict.items() if actions is not None}
    nt_states: CellSet = {s for s in q}
    uniform_states: Choose[Cell] = Choose(nt_states)
    for episode_num in range(episodes):
        state: Cell = uniform_states.sample()
        '''
        write your code here
        update the dictionary q initialized above according
        to the Q-learning algorithm's Q-Value Function updates.
        '''

    vf_dict: V[Cell] = {s: max(d.values()) for s, d in q.items()}
    policy: FinitePolicy[Cell, Move] = FinitePolicy({
        s: Constant(max(d.items(), key=itemgetter(1))[0])
        for s, d in q.items()
    })
    return (vf_dict, policy)
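# A hedged sketch of the episode loop that the "write your code here"
# placeholder above asks for: one possible tabular Q-learning update, not the
# assignment's official solution. It assumes an undiscounted objective (no
# discount factor is passed to get_q_learning_vf_and_policy) and that an
# episode ends when sample_func returns a state outside nt_states. Cell and
# Move are the type aliases used above; the helper name below is mine.
from operator import itemgetter
from typing import Callable, Dict, Set, Tuple

from rl.distribution import Categorical, Choose


def q_learning_episode_sketch(
    q: Dict[Cell, Dict[Move, float]],
    nt_states: Set[Cell],
    start_state: Cell,
    sample_func: Callable[[Cell, Move], Tuple[Cell, float]],
    step_size: float,
    epsilon: float
) -> None:
    '''Run one Q-learning episode, updating q in place.'''
    state: Cell = start_state
    while state in nt_states:
        # epsilon-greedy action selection from the current Q estimates
        if Categorical({True: epsilon, False: 1 - epsilon}).sample():
            action: Move = Choose(set(q[state])).sample()
        else:
            action = max(q[state].items(), key=itemgetter(1))[0]
        next_state, reward = sample_func(state, action)
        next_q: float = max(q[next_state].values()) \
            if next_state in nt_states else 0.
        # Q(s, a) <- Q(s, a) + alpha * (r + max_a' Q(s', a') - Q(s, a))
        q[state][action] += step_size * (reward + next_q - q[state][action])
        state = next_state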
def reward_MRP_simulation(
    alpha: float,
    gamma: float,
    num_time: int,
    stock_MP3: StockPriceMP3,
    init_price: float,
    f: Callable[[float], float]
) -> List[Tuple[StateMP3, float]]:
    """
    Simulate reward from MRP stock price model for a fixed time interval.

    :param alpha: Reverse-pull strength in stock model
    :param gamma: Discount factor
    :param num_time: Number of time steps during which to record simulation
    :param stock_MP3: Markov process representation of the stock price model
    :param init_price: Initial stock price
    :param f: Function computing reward at time t from state at time t
    :returns: List of (state, reward) tuples obtained by MRP simulation
    """
    stock_MRP3: StockPriceMRP3 = StockPriceMRP3(alpha, stock_MP3,
                                                init_price, f)
    start: Constant = Constant(StateMP3(0, 0))
    return [
        (step.next_state, step.reward * gamma ** (t + 1))
        for t, step in enumerate(
            itertools.islice(
                stock_MRP3.simulate_reward(start),
                num_time + 1
            )
        )
    ]
def main(num_pads):
    # 2^(num_pads-2) deterministic policies
    fc_mdp: FiniteMarkovDecisionProcess[FrogState, Any] = \
        FrogCroak(num_pads + 1)
    all_fp = list(itertools.product(['A', 'B'],
                                    repeat=fc_mdp.num_pads - 2))
    all_mrp_value = []
    for fp in all_fp:
        fdp: FinitePolicy[FrogState, Any] = FinitePolicy(
            {FrogState(i + 1): Constant(fp[i]) for i in range(len(fp))})
        implied_mrp: FiniteMarkovRewardProcess[FrogState] = \
            fc_mdp.apply_finite_policy(fdp)
        all_mrp_value.append(implied_mrp.get_value_function_vec(1))

    # find the optimal policy (assumes a single policy attains the
    # maximum value in every state)
    max_indices = []
    value_matrix = np.array(all_mrp_value)
    for i in range(num_pads - 1):
        max_indices.append(np.argmax(value_matrix[:, i]))
    max_index = list(set(max_indices))[0]
    print(value_matrix[max_index, :])
    print(all_fp[max_index])

    plt.plot([
        'State' + str(i + 1) + ',' + all_fp[max_index][i]
        for i in range(num_pads - 1)
    ], value_matrix[max_index, :], 'o')
    plt.xlabel('Frog State')
    plt.ylabel('Probability')
    plt.title('n = ' + str(num_pads - 1))
    plt.show()
def act(self, state: State) -> Optional[Distribution[Action]]:
    # Closed-form optimal bid/ask offsets from the mid-price
    # (Avellaneda-Stoikov-style market-making solution)
    delta_b: float = (2 * state.I + 1) * self.gamma * self.sigma ** 2 * \
        (self.T - state.t) / 2 + 1 / self.gamma * \
        np.log(1 + self.gamma / self.k)
    delta_a: float = (-2 * state.I + 1) * self.gamma * self.sigma ** 2 * \
        (self.T - state.t) / 2 + 1 / self.gamma * \
        np.log(1 + self.gamma / self.k)
    Pb: float = state.S - delta_b
    Pa: float = state.S + delta_a
    return Constant(Action(Pb=Pb, Pa=Pa))
def get_mapping(self) -> StateActionMapping[State, Action]:
    # We need to define the StateActionMapping for this finite MDP
    mapping: StateActionMapping[State, Action] = {}
    list_actions: List[Action] = []
    # We start by defining all the available actions
    for i in range(self.H + 1):
        range_j = self.H - i
        for j in range(range_j + 1):
            list_actions.append(Action(i, j))
    self.list_actions: List[Action] = list_actions

    list_states: List[State] = []
    # Then we define all the possible states
    for i in range(1, self.W + 1):
        list_states.append(State(i))
    self.list_states: List[State] = list_states

    for state in list_states:
        submapping: ActionMapping[Action, StateReward[State]] = {}
        for action in list_actions:
            s: int = action.s
            l: int = action.l
            reward: float = state.wage * (self.H - l - s)
            pois_mean: float = self.alpha * l
            proba_offer: float = self.beta * s / self.H
            if state.wage == self.W:
                # If you're in state W, you stay in state W with
                # probability 1. The reward only depends on the action
                # you have chosen.
                submapping[action] = Constant((state, reward))
            elif state.wage == self.W - 1:
                # If you're in state W-1, you can either stay in your
                # state or land in state W.
                submapping[action] = Categorical({
                    (state, reward):
                        poisson.pmf(0, pois_mean) * (1 - proba_offer),
                    (State(self.W), reward):
                        proba_offer + (1 - proba_offer) *
                        (1 - poisson.pmf(0, pois_mean))
                })
            else:
                # If you're in any other state, you can land in any
                # state between your current state and W, with
                # probabilities as described before.
                dic_distrib = {}
                dic_distrib[(state, reward)] = poisson.pmf(
                    0, pois_mean) * (1 - proba_offer)
                dic_distrib[(State(state.wage + 1), reward)] = \
                    proba_offer * poisson.cdf(1, pois_mean) + \
                    (1 - proba_offer) * poisson.pmf(1, pois_mean)
                for k in range(2, self.W - state.wage):
                    dic_distrib[(State(state.wage + k), reward)] = \
                        poisson.pmf(k, pois_mean)
                dic_distrib[(State(self.W), reward)] = 1 - poisson.cdf(
                    self.W - state.wage - 1, pois_mean)
                submapping[action] = Categorical(dic_distrib)
        mapping[state] = submapping
    return mapping
def process_traces(
    time_steps: int,
    num_traces: int,
    game: SnakesAndLaddersGame
) -> np.ndarray:
    start_state_distribution = Constant(StateSnakeAndLadder(position=1))
    array_length = []
    for i in range(num_traces):
        new_val = np.fromiter(
            (s.position for s in itertools.islice(
                game.simulate(start_state_distribution),
                time_steps + 1
            )),
            float
        )
        array_length += [len(new_val)]
    return np.array(array_length)
def test_flip_flop(self):
    trace = list(
        itertools.islice(self.flip_flop.simulate_reward(Constant(True)),
                         10))

    self.assertTrue(all(isinstance(outcome, bool) for outcome, _ in trace))

    cumulative_reward = sum(reward for _, reward in trace)
    self.assertTrue(0 <= cumulative_reward <= 10)
def process1_price_traces(
    start_price: int,
    level_param: int,
    alpha1: float,
    time_steps: int,
    num_traces: int
) -> np.ndarray:
    mp = StockPriceMP1(level_param=level_param, alpha1=alpha1)
    start_state_distribution = Constant(StateMP1(price=start_price))
    return np.vstack([
        np.fromiter((s.price for s in itertools.islice(
            mp.simulate(start_state_distribution),
            time_steps + 1
        )), float) for _ in range(num_traces)
    ])
def test_choose(self):
    assert_almost_equal(self, self.one, Constant(1))
    self.assertAlmostEqual(self.one.probability(1), 1.)
    self.assertAlmostEqual(self.one.probability(0), 0.)

    categorical_six = Categorical({x: 1 / 6 for x in range(1, 7)})
    assert_almost_equal(self, self.six, categorical_six)
    self.assertAlmostEqual(self.six.probability(1), 1 / 6)
    self.assertAlmostEqual(self.six.probability(0), 0.)
def act(self, s: S) -> Optional[Distribution[A]]:
    if mdp.is_terminal(s):
        return None

    if explore.sample():
        return Choose(set(mdp.actions(s)))

    _, action = q.argmax((s, a) for a in mdp.actions(s))
    return Constant(action)
def get_all_deterministic_policies(
        self) -> Sequence[FinitePolicy[LilypadState, str]]:
    bin_to_act = {'0': 'A', '1': 'B'}
    all_action_comb = self.get_all_action_combinations()
    all_policies = []
    for action_comb in all_action_comb:
        policy: FinitePolicy[LilypadState, str] = FinitePolicy(
            {LilypadState(i + 1): Constant(bin_to_act[a])
             for i, a in enumerate(action_comb)}
        )
        all_policies.append(policy)
    return all_policies
def act(self, state: S) -> Constant[A]:
    return Constant(
        max(
            (
                (mdp.step(state, a).expectation(return_), a)
                for a in mdp.actions(state)
            ),
            key=itemgetter(0),
        )[1]
    )
def test_flip_flop(self):
    trace = list(
        itertools.islice(self.flip_flop.simulate_reward(Constant(True)),
                         10))

    self.assertTrue(
        all(isinstance(step.next_state, bool) for step in trace))

    cumulative_reward = sum(step.reward for step in trace)
    self.assertTrue(0 <= cumulative_reward <= 10)
def process2_price_traces(
    start_price: int,
    alpha2: float,
    time_steps: int,
    num_traces: int
) -> np.ndarray:
    mp = StockPriceMP2(alpha2=alpha2)
    start_state_distribution = Constant(
        StateMP2(price=start_price, is_prev_move_up=None))
    return np.vstack([
        np.fromiter((s.price for s in itertools.islice(
            mp.simulate(start_state_distribution),
            time_steps + 1
        )), float) for _ in range(num_traces)
    ])
def act(self, state: S) -> Constant[A]:
    return Constant(
        max(
            (
                (res.expectation(return_), a)
                for a, res in step[state].items()
            ),
            key=itemgetter(0),
        )[1]
    )
def test_optimal_policy(self):
    finite = finite_horizon_MDP(self.finite_flip_flop, limit=10)
    steps = unwrap_finite_horizon_MDP(finite)
    *v_ps, (v, p) = optimal_vf_and_policy(steps, gamma=1)

    for s in p.states():
        self.assertEqual(p.act(s), Constant(False))

    self.assertAlmostEqual(v_ps[0][0][True], 17)
    self.assertAlmostEqual(v_ps[5][0][False], 17 / 2)
def get_opt_vf_from_q(
    q_value: Mapping[Tuple[S, A], float]
) -> Tuple[Mapping[S, float], FinitePolicy[S, A]]:
    v: Mapping[S, float] = {}
    policy_map: Mapping[S, Optional[Constant[A]]] = {}
    for i in q_value:
        state, action = i
        if state not in v.keys() or q_value[i] > v[state]:
            v[state] = q_value[i]
            policy_map[state] = Constant(action)
    Pi = FinitePolicy(policy_map)
    return (v, Pi)
def get_policies(n) -> Iterable[FinitePolicy[StatePond, Action]]:
    list_policies: Iterable[FinitePolicy[StatePond, Action]] = []
    liste_actions: list = list(itertools.product(['A', 'B'], repeat=n - 1))
    for i in liste_actions:
        policy_map: Mapping[StatePond,
                            Optional[FiniteDistribution[Action]]] = {}
        policy_map[StatePond(0)] = None
        policy_map[StatePond(n)] = None
        for j in range(0, n - 1):
            policy_map[StatePond(j + 1)] = Constant(Action(i[j]))
        list_policies += [FinitePolicy(policy_map)]
    return list_policies
def process3_price_traces(
    start_price: int,
    alpha3: float,
    time_steps: int,
    num_traces: int
) -> np.ndarray:
    mp = StockPriceMP3(alpha3=alpha3)
    start_state_distribution = Constant(
        StateMP3(num_up_moves=0, num_down_moves=0))
    return np.vstack([
        np.fromiter(
            (start_price + s.num_up_moves - s.num_down_moves
             for s in itertools.islice(mp.simulate(start_state_distribution),
                                       time_steps + 1)),
            float
        ) for _ in range(num_traces)
    ])
def get_vf_and_policy_from_qvf(
    mdp: FiniteMarkovDecisionProcess[S, A],
    qvf: FunctionApprox[Tuple[S, A]]
) -> Tuple[V[S], FinitePolicy[S, A]]:
    opt_vf: V[S] = {
        s: max(qvf((s, a)) for a in mdp.actions(s))
        for s in mdp.non_terminal_states
    }
    opt_policy: FinitePolicy[S, A] = FinitePolicy({
        s: Constant(qvf.argmax((s, a) for a in mdp.actions(s))[1])
        for s in mdp.non_terminal_states
    })
    return opt_vf, opt_policy
def step(
    self,
    state: float,
    action: bool
) -> SampledDistribution[Tuple[float, float]]:
    if action:
        return Constant((state, payoffs(state)))
    else:
        def sr_sampler_func(
            state=state,
            action=action
        ) -> Tuple[float, float]:
            next_state_price: float = asset_distribution.sample()
            reward: float = 0
            return (next_state_price, reward)

        return SampledDistribution(
            sampler=sr_sampler_func,
            expectation_samples=1000
        )
def frog_problem_traces(num_lilypads: int, n_traces: int) -> np.ndarray:
    """Simulate frog problem to predict expected hops required to cross river.

    In each simulation, the frog starts on a riverbank, so the starting state
    will always be equal to the total number of lilypads in the simulation.

    :param num_lilypads: Number of lilypads between riverbanks
    :param n_traces: Number of traces to generate
    :return: Hopping counts required to cross the river obtained from traces
    """
    frog_problem_sim = FrogProblemMPFinite(n_lilypads=num_lilypads)
    start_state = Constant(FrogState(num_lilypads))
    return np.fromiter((len(list(trace)) for trace in itertools.islice(
        frog_problem_sim.traces(start_state), n_traces + 1)), int)
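# A possible usage of frog_problem_traces (an illustrative sketch, not from
# the original code): estimate the expected number of hops needed to cross a
# river with 10 lilypads by averaging the simulated hop counts.
import numpy as np

hop_counts = frog_problem_traces(num_lilypads=10, n_traces=1000)
print(f"Estimated expected hops to cross: {np.mean(hop_counts):.2f}")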
def greedy_policy_from_vf(
    mdp: FiniteMarkovDecisionProcess[S, A],
    vf: V[S],
    gamma: float
) -> FinitePolicy[S, A]:
    greedy_policy_dict: Dict[S, FiniteDistribution[A]] = {}

    for s in mdp.non_terminal_states:
        q_values: Iterator[Tuple[A, float]] = \
            ((a, mdp.mapping[s][a].expectation(
                lambda s_r: s_r[1] + gamma * vf.get(s_r[0], 0.)
            )) for a in mdp.actions(s))

        greedy_policy_dict[s] = \
            Constant(max(q_values, key=operator.itemgetter(1))[0])

    return FinitePolicy(greedy_policy_dict)
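# A hedged sketch of how greedy_policy_from_vf could sit inside policy
# iteration. It is an illustration, not the original author's code: it assumes
# the apply_finite_policy / get_value_function_vec / non_terminal_states API
# used elsewhere in these snippets, and it reuses the types imported above
# (FiniteMarkovDecisionProcess, FinitePolicy, V, S, A). The function and
# variable names below are mine.
def policy_iteration_sketch(
    mdp: FiniteMarkovDecisionProcess[S, A],
    gamma: float,
    num_iterations: int = 100
) -> Tuple[V[S], FinitePolicy[S, A]]:
    # start from the all-zeros value function and its greedy policy
    vf: V[S] = {s: 0. for s in mdp.non_terminal_states}
    policy: FinitePolicy[S, A] = greedy_policy_from_vf(mdp, vf, gamma)
    for _ in range(num_iterations):
        # policy evaluation: exact value function of the implied MRP
        mrp = mdp.apply_finite_policy(policy)
        vf = {s: v for s, v in zip(mrp.non_terminal_states,
                                   mrp.get_value_function_vec(gamma))}
        # policy improvement: act greedily with respect to the new values
        policy = greedy_policy_from_vf(mdp, vf, gamma)
    return vf, policy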
def get_action_transition_reward_map(self, maze: Maze):
    d: Dict[GridState, Dict[str, Categorical[Tuple[GridState, float]]]] = {}
    for x in range(maze.nx):
        for y in range(maze.ny):
            state = GridState(x, y)
            if state != self.goal:
                d1: Dict[str, Categorical[Tuple[GridState, float]]] = {}
                cell = maze.cell_at(x, y)
                for move, next_cell in maze.find_valid_neighbours(cell):
                    if not cell.has_wall_at(move):
                        next_state = GridState(next_cell.x, next_cell.y)
                        d1[move] = Constant(
                            (next_state, self.reward_func(next_state)))
                d[state] = d1
    return d
def fraction_of_days_oos(
    self,
    policy: Policy[InventoryState, int],
    time_steps: int,
    num_traces: int
) -> float:
    impl_mrp: MarkovRewardProcess[InventoryState] = \
        self.apply_policy(policy)
    count: int = 0
    high_fractile: int = int(poisson(self.poisson_lambda).ppf(0.98))
    start: InventoryState = random.choice(
        [InventoryState(i, 0) for i in range(high_fractile + 1)])

    for _ in range(num_traces):
        steps = itertools.islice(
            impl_mrp.simulate_reward(Constant(start)),
            time_steps
        )
        for step in steps:
            if step.reward < -self.holding_cost * step.state.on_hand:
                count += 1

    return float(count) / (time_steps * num_traces)
def step(
    self,
    state: Tuple[int, float],
    action: bool
) -> SampledDistribution[Tuple[int, float]]:
    if state[0] > expiry_time or state[0] == -1:
        return None
    elif action:
        return Constant(((-1, state[1]), payoffs(state[1])))
    else:
        def sr_sampler_func(
            state=state,
            action=action
        ) -> Tuple[Tuple[int, float], float]:
            next_state_price: float = asset_distribution.sample()
            next_state_time = state[0] + 1
            reward: float = 0
            return ((next_state_time, next_state_price), reward)

        return SampledDistribution(
            sampler=sr_sampler_func,
            expectation_samples=1000
        )