def main(): """Run the prediction algorithms on the vampire problem. """ from pprint import pprint # Specify a starting state distribution for the number of villagers num_villagers: int = 10 start_state_dist: dist.Categorical[S] = dist.Categorical({ vampire.State(i, True): 1 / num_villagers for i in range(1, num_villagers + 1) }) # Represent the problem as an MDP vampire_mdp: mdp.FiniteMarkovDecisionProcess[S, A] =\ vampire.VampireMDP(num_villagers) # Use dynamic programming to obtain the optimal value function and policy true_val, pi = dp.policy_iteration_result(vampire_mdp, 1) print("True optimal value function: ") pprint(true_val) # Apply Tabular MC prediction to approximate optimal value function vampire_mrp: mp.FiniteMarkovRewardProcess[S] =\ vampire_mdp.apply_finite_policy(pi) num_traces = 1000000 traces = get_traces(vampire_mrp, start_state_dist, num_traces) pred_val_mc = tabular_mc_prediction(traces, 1) print("Predicted value function by MC prediction: ") pprint(pred_val_mc) # Apply Tabular TD prediction to approximate optimal value function atomic_experiences = [step for trace in traces for step in trace] pred_val_td = tabular_td_prediction(atomic_experiences, 0.0001, 1) print("Predicted value function by TD prediction: ") pprint(pred_val_td)
def main(): """Run the control algorithms. Test the control algorithms using the `Vampire Problem` MDP. """ # Specify a starting state distribution for the number of villagers num_villagers: int = 10 start_state_dist: dist.Categorical[S] = dist.Categorical({ vampire.State(i, True): 1 / num_villagers for i in range(1, num_villagers + 1) }) # Represent the problem as an MDP vampire_mdp: mdp.FiniteMarkovDecisionProcess[S, A] =\ vampire.VampireMDP(num_villagers) # Use dynamic programming to obtain the optimal value function and policy true_val, pi = dp.policy_iteration_result(vampire_mdp, 1) print("True optimal policy: ") print(pi) print() print("True optimal value function: ") pprint(true_val) # Apply tabular MC control to obtain the optimal policy and value function pred_action_val, pred_pi = tabular_mc_control(vampire_mdp, 1, start_state_dist, 10000) print("Predicted optimal policy: ") for i in range(1, num_villagers + 1): print("Num Villagers: " + str(i) + "; Vampire Alive: True") print(pred_pi.act((vampire.State(i, True)))) print() print("Predicted optimal action-value function: ") print_if_optimal(pred_action_val, pred_pi) # Apply tabular SARSA to obtain the optimal policy and value function pred2_action_val, pred2_pi = tabular_sarsa(vampire_mdp, 1, start_state_dist, 10000) print("Predicted optimal policy: ") for i in range(1, num_villagers + 1): print("Num Villagers: " + str(i) + "; Vampire Alive: True") print(pred2_pi.act((vampire.State(i, True)))) print() print("Predicted optimal action-value function: ") print_if_optimal(pred2_action_val, pred2_pi) # Apply tabular Q-learning to obtain the optimal policy and value function pred3_action_val, pred3_pi = tabular_qlearning(vampire_mdp, 1, start_state_dist, 100000) print("Predicted optimal policy: ") for i in range(1, num_villagers + 1): print("Num Villagers: " + str(i) + "; Vampire Alive: True") print(pred3_pi.act((vampire.State(i, True)))) print() print("Predicted optimal action-value function: ") print_if_optimal(pred3_action_val, pred3_pi)
def main(): """Run the prediction algorithms on the vampire problem. """ from pprint import pprint # Specify a starting state distribution for the number of villagers num_villagers: int = 10 start_state_dist: dist.Categorical[S] = dist.Categorical( { vampire.State( i, True ): 1 / num_villagers for i in range(1, num_villagers+1) } ) # Represent the problem as an MDP vampire_mdp: mdp.FiniteMarkovDecisionProcess[S, A] =\ vampire.VampireMDP(num_villagers) # Use dynamic programming to obtain the optimal value function and policy true_val, pi = dp.policy_iteration_result(vampire_mdp, 1) print("True optimal value function: ") pprint(true_val) # Express the vampire problem as an MRP and sample traces vampire_mrp: mp.FiniteMarkovRewardProcess[S] =\ vampire_mdp.apply_finite_policy(pi) num_traces = 100000 traces = get_traces(vampire_mrp, start_state_dist, num_traces) # Apply tabular TD-lambda to approximate optimal value function pred_val_td_lambda, _ = tabular_TD_lambda( traces=traces, learning_rate=get_learning_rate, lambda_param=0.5, gamma=1 ) print("Predicted value function by TD-lambda prediction: ") print_non_terminal_vampire_states(pred_val_td_lambda) # Apply tabular n-step boostrap to predict optimal value function pred_val_n_step, _ = tabular_n_step_bootstrap( traces=traces, learning_rate=get_learning_rate, n_step=3, gamma=1 ) print("Predicted value function by tabular n-step prediction: ") print_non_terminal_vampire_states(pred_val_n_step) # Plot Convergence of VF prediction by TD-lambda at various lambdas run_tabular_td_lambda( traces=traces, learning_rate=get_learning_rate, lambda_param=[0, 0.25, 0.5, 0.75, 0.99], gamma=1 )
def main(): """Test the LSTD algorithm on the Vampire problem MDP. """ from pprint import pprint # Specify a starting state distribution for the number of villagers num_villagers: int = 10 start_state_dist: dist.Categorical[S] = dist.Categorical( { vampire.State( i, True ): 1 / num_villagers for i in range(1, num_villagers+1) } ) # Represent the problem as an MDP vampire_mdp: mdp.FiniteMarkovDecisionProcess[S, A] =\ vampire.VampireMDP(num_villagers) # Use dynamic programming to obtain the optimal value function and policy true_val, pi = dp.policy_iteration_result(vampire_mdp, 1) print("True optimal value function: ") pprint(true_val) # Express the vampire problem as an MRP and sample experiences vampire_mrp: mp.FiniteMarkovRewardProcess[S] =\ vampire_mdp.apply_finite_policy(pi) num_traces = 10000 traces = get_traces(vampire_mrp, start_state_dist, num_traces) experiences = [trace[i] for trace in traces for i in range(len(trace))] # Generate feature vector, weights, and approx VF for non-terminal states vf = {} weights = LSTD(feature_functions, experiences, 1) for i in range(1, num_villagers+1): vampire_state = vampire.State(n=i, v=True) vf[vampire_state] = np.matmul( get_feature_vec(feature_functions, vampire_state), weights )[0] print("Predicted optimal value function: ") pprint(vf) # Generate a random set of atomic experiences from random policies random_experiences = get_traces_over_random_actions( vampire_mdp, start_state_dist, 10000 ) lstdq_weights = LSTDQ( action_feature_funcs, random_experiences, 1 ) print(lstdq_weights)
def process_time(n: int, gamma: float = 1) -> Tuple[float, float, float]:
    """Time brute-force policy search, value iteration, and policy iteration
    on a LilypadModel with n lilypads."""
    print(f"n={n}")
    model = LilypadModel(n)

    # Brute-force search over all deterministic policies
    start = time.time()
    list_policies = get_policies(n)
    optimal_policy, list_sum, list_values, idx_max = \
        get_optimal_policy(n, model, list_policies, gamma=gamma)
    time_brute = time.time() - start

    # Value iteration
    start_2 = time.time()
    value_iter = value_iteration_result(model, 1)
    time_value_iter = time.time() - start_2

    # Policy iteration
    start_3 = time.time()
    policy_iter = policy_iteration_result(model, 1)
    time_policy_iter = time.time() - start_3

    return time_brute, time_value_iter, time_policy_iter
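A possible driver for `process_time`, comparing solver runtimes as the problem grows; the problem sizes below are illustrative, not from the original experiments:

# Hypothetical driver: compare solver runtimes across problem sizes
for n in [3, 5, 10, 15, 20]:
    t_brute, t_vi, t_pi = process_time(n)
    print(f"n={n}: brute={t_brute:.3f}s, "
          f"value_iter={t_vi:.3f}s, policy_iter={t_pi:.3f}s")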
# Uniform start-state distribution over the non-terminal states (the opening
# of this expression is missing from the snippet; reconstructed here as an
# assumption consistent with the `start_states` argument used below)
start_states = Categorical({
    state: 1 / len(si_mdp.non_terminal_states)
    for state in si_mdp.non_terminal_states
})
mc_tabular_control = mc_control(
    si_mdp, start_states, Tabular(start_map, start_map), user_gamma, 800
)
values_map = mc_tabular_control.values_map
opt_vf, opt_pi = get_optimal_policy(values_map)
print('opt_vf mc control: \n', opt_vf, '\nopt_pi mc control: \n', opt_pi)

fdp: FinitePolicy[InventoryState, int] = FinitePolicy({
    InventoryState(alpha, beta): Constant(user_capacity - (alpha + beta))
    for alpha in range(user_capacity + 1)
    for beta in range(user_capacity + 1 - alpha)
})
implied_mrp: FiniteMarkovRewardProcess[InventoryState] = \
    si_mdp.apply_finite_policy(fdp)

print("MDP Value Iteration Optimal Value Function and Optimal Policy")
print("--------------")
opt_vf_vi, opt_policy_vi = value_iteration_result(si_mdp, gamma=user_gamma)
print(opt_vf_vi, '\n')
print(opt_policy_vi)

print("MDP Policy Iteration Optimal Value Function and Optimal Policy")
print("--------------")
opt_vf_pi, opt_policy_pi = policy_iteration_result(si_mdp, gamma=user_gamma)
print(opt_vf_pi, '\n')
print(opt_policy_pi)
""" d: Dict[State, Dict[Action, Categorical[Tuple[State, float]]]] = {} # Specify terminal state transition probabilies for i in range(self.init_pop + 1): d[State(i, False)] = None d[State(0, True)] = None # Specify non-terminal state transition probabilities for i in range(1, self.init_pop + 1): state: State = State(i, True) d1: Dict[Action, Categorical[Tuple[State, float]]] = {} for p in range(i): sr_probs_dict: Dict[Tuple[State, float], float] = {} sr_probs_dict[(State(i - p, False), i - p)] = p / i sr_probs_dict[(State(i - p - 1, True), 0)] = 1 - p / i d1[Action(p)] = Categorical(sr_probs_dict) d[state] = d1 return d if __name__ == "__main__": from pprint import pprint vampire_mdp = VampireMDP(100) val, pi = dp.policy_iteration_result(vampire_mdp, 1) pprint(pi) pprint(val)
y_brute = []
y_pi = []
y_vi = []

for number_of_lilypads in x:
    frog_mdp: FrogEscapeMDP = FrogEscapeMDP(number_of_lilypads)
    all_det_policies: Sequence[FinitePolicy[LilypadState, str]] = \
        frog_mdp.get_all_deterministic_policies()

    # Brute force over all deterministic policies
    t1 = time.time()
    optimal_det_policy, optimal_value_fun = \
        frog_mdp.get_optimal(all_det_policies)
    t2 = time.time()
    time_brute_force = t2 - t1
    y_brute.append(time_brute_force)

    # Policy Iteration
    t1 = time.time()
    opt_vf_pi, opt_policy_pi = policy_iteration_result(frog_mdp, gamma=1)
    t2 = time.time()
    time_policy_iter = t2 - t1
    y_pi.append(time_policy_iter)
    # pprint(opt_vf_pi)
    # print(opt_policy_pi)

    # Value Iteration
    t1 = time.time()
    opt_vf_vi, opt_policy_vi = value_iteration_result(frog_mdp, gamma=1)
    t2 = time.time()
    time_value_iter = t2 - t1
    y_vi.append(time_value_iter)
    # pprint(opt_vf_vi)
    # print(opt_policy_vi)
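The three timing lists collected above feed naturally into a comparison plot; a minimal matplotlib sketch, assuming `x` holds the lilypad counts iterated over:

import matplotlib.pyplot as plt

# Hypothetical plot of the timing results collected above
plt.plot(x, y_brute, label="Brute force")
plt.plot(x, y_pi, label="Policy iteration")
plt.plot(x, y_vi, label="Value iteration")
plt.xlabel("Number of lilypads")
plt.ylabel("Solve time (seconds)")
plt.legend()
plt.show()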
user_stockout_cost2 = 15.0

store1: FiniteMarkovDecisionProcess[InventoryState, int] = \
    SimpleInventoryMDPCap(
        capacity=user_capacity,
        poisson_lambda=user_poisson_lambda,
        holding_cost=user_holding_cost,
        stockout_cost=user_stockout_cost
    )

store2: FiniteMarkovDecisionProcess[InventoryState, int] = \
    SimpleInventoryMDPCap(
        capacity=user_capacity2,
        poisson_lambda=user_poisson_lambda2,
        holding_cost=user_holding_cost2,
        stockout_cost=user_stockout_cost2
    )

# Combine the two stores into a single joint MDP parameterized by K1 and K2
K1 = 1
K2 = 1
problem4 = ComplexMDP(store1=store1, store2=store2, K1=K1, K2=K2)

value_opt = value_iteration_result(problem4, user_gamma)
policy_opt = policy_iteration_result(problem4, user_gamma)
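The optimal results above are computed but never displayed; a small follow-up that prints them, assuming the same (value function, policy) tuple convention these solvers return in the earlier snippets:

# Hypothetical printout of the joint-MDP solution
opt_vf_vi, opt_policy_vi = value_opt
print("ComplexMDP Value Iteration Optimal Value Function and Optimal Policy")
print(opt_vf_vi, '\n')
print(opt_policy_vi)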