import math
import sys
import numpy as np
import tqdm
from craam import crobust  # assumed import path for the CRAAM robust-MDP wrapper used below

def incrementally_improve_V(vf, threshold, dir_points):
    is_multiple_v = False
    value_functions = [[]]
    value_functions[0].append(vf[0])
    for v in range(max_demand+1, 0, -1):
        value_functions[0].append(vf[-v])

    # This loop iterates incrementally with the latest value function
    # to further improve upon it.
    for i in range(num_iterations_for_vf):
        #print("iterative vf",value_functions)
        threshold = np.zeros((tuple_size, action_count))
        rmdp = crobust.MDP(0, discount_factor)
        pos = 0
        for s in range(mdp.state_count()):
            actions = mdp.action_count(s)
            for a in range(actions):
                if len(mdp.get_toids(s, a)) == 0:
                    continue
                threshold[0, pos] = s
                threshold[1, pos] = a
                threshold[2, pos] = construct_rmdp(s, a, value_functions, rmdp, dir_points, is_multiple_v)
                pos += 1
        #print("MDP: ",rmdp.to_json())
        sol = rmdp.rsolve_vi("robust_l1".encode(), threshold)
        vf = sol.valuefunction
        value_functions = [[]]

        # As the (S,s) policy is used as the random policy to generate samples,
        # the possible inventory levels are 0 and (max_inventory - max_demand)
        # to max_inventory, inclusive. So filter out the value function entries
        # for the possible states.
        value_functions[0].append(vf[0])
        for v in range(max_demand+1, 0, -1):
            value_functions[0].append(vf[-v])

    return value_functions[0][0]  # initial state's value of the 0th value function
def randomly_improve_V(vf, threshold, dir_points):
    is_multiple_v = True
    value_functions = [[]]
    value_functions[0].append(vf[0])
    for s in range(max_demand+1, 0, -1):
        value_functions[0].append(vf[-s])

    # This loop iterates incrementally with the latest value function
    # to further improve upon it.
    for i in range(1, num_iterations_for_vf+1, 1):
        #print("random vf",value_functions)
        threshold = np.zeros((tuple_size, action_count))
        #value_functions.append(np.random.randint(10, size=(max_demand-min_demand+2)))
        rmdp = crobust.MDP(0, discount_factor)
        pos = 0
        print("num iteration----", i)
        for s in range(mdp.state_count()):
            actions = mdp.action_count(s)
            for a in range(actions):
                if len(mdp.get_toids(s, a)) == 0:
                    continue
                threshold[0, pos] = s
                threshold[1, pos] = a
                threshold[2, pos] = construct_rmdp(s, a, value_functions, rmdp, dir_points, is_multiple_v)
                pos += 1
        #print("MDP: ",rmdp.to_json())
        sol = rmdp.rsolve_vi("robust_l1".encode(), threshold)
        vf = sol.valuefunction
        value_functions.append([])
        value_functions[i].append(vf[0])
        for s in range(max_demand+1, 0, -1):
            value_functions[i].append(vf[-s])

    return vf[0]
for s in range(num_states):
    # action left
    transitions[s, 0, max(s - 1, 0)] = 1

    # action right
    transitions[s, 1, s] = 0.4 if s == 0 else 0.6
    transitions[s, 1, min(s + 1, num_states - 1)] = 0.6 if s == 0 else (0.6 if s == num_states - 1 else 0.35)
    transitions[s, 1, max(s - 1, 0)] = 0.4 if s == 0 else (0.4 if s == num_states - 1 else 0.05)

true_mdp = crobust.MDP(0, discount_factor)
for s in range(num_states):
    true_mdp.add_transition(s, 0, max(s - 1, 0), transitions[s, 0, max(s - 1, 0)], rewards[s, 0, max(s - 1, 0)])
    true_mdp.add_transition(s, 1, s, transitions[s, 1, s], rewards[s, 1, s])
    if s < num_states - 1:
        true_mdp.add_transition(s, 1, min(s + 1, num_states - 1),
                                transitions[s, 1, min(s + 1, num_states - 1)],
                                rewards[s, 1, min(s + 1, num_states - 1)])
    if s > 0:
        true_mdp.add_transition(s, 1, max(s - 1, 0),
                                transitions[s, 1, max(s - 1, 0)],
                                rewards[s, 1, max(s - 1, 0)])
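# The chain construction above assumes that `num_states`, `transitions`, and `rewards` are defined
# earlier. A hypothetical initialization is sketched below to make the required shapes explicit;
# the concrete values (5 states, unit reward for action "right") are placeholders, not taken from
# the original experiment.
num_states = 5  # placeholder size for the chain
transitions = np.zeros((num_states, 2, num_states))   # state x action x next-state probabilities
rewards = np.zeros((num_states, 2, num_states))       # state x action x next-state rewards
rewards[:, 1, :] = 1.0                                 # placeholder reward for taking action 1 ("right")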
def OFVF(num_states, num_actions, num_next_states, valuefunctions, posterior_transition_points, num_update, sa_confidence, discount_factor):
    """
    Method to incrementally improve the value function by adding the new value function to the
    previous value functions, and finding the nominal point & threshold for this cluster of
    value functions with the required sa-confidence.

    @valuefunctions The initially known value function computed from the true MDP
    @posterior_transition_points The posterior transition points obtained from Bayesian sampling,
        for which the nominal point & threshold are computed
    @num_update Number of updates over the value functions
    @sa_confidence Required confidence for each state-action, computed from the union bound

    @return The robust solution (rsol) of the final iteration
    """
    horizon = 1
    #s = 0
    valuefunctions = [valuefunctions]

    # Store the nominal points for each state-action pair
    nominal_points = {}

    # Store the latest nominal of nominal points & threshold
    nominal_threshold = {}
    under_estimate, real_regret = 0.0, 0.0
    i = 0
    while i <= num_update:
        #try:
        # Keep track of whether the current iteration leaves the MDP unchanged
        is_mdp_unchanged = True
        threshold = [[] for _ in range(3)]
        rmdp = crobust.MDP(0, discount_factor)
        #print("update", i)
        for s in range(num_states):
            for a in range(num_actions):
                bayes_points = np.asarray(posterior_transition_points[s, a])
                RSVF_nominalPoints = []

                #for bayes_points in trans:
                #print("bayes_points", bayes_points, "valuefunctions[-1]", valuefunctions[-1])
                ivf = construct_uset_known_value_function(bayes_points, valuefunctions[-1], sa_confidence)
                RSVF_nominalPoints.append(ivf[2])
                new_trp = np.mean(RSVF_nominalPoints, axis=0)

                if (s, a) not in nominal_points:
                    nominal_points[(s, a)] = []

                trp, th = None, 0
                # If there is a previously constructed L1 ball, check whether the new nominal
                # point lies outside of the current L1 ball & needs to be considered.
                if (s, a) in nominal_threshold:
                    old_trp, old_th = nominal_threshold[(s, a)][0], nominal_threshold[(s, a)][1]

                    # Compute the L1 distance between the newly computed nominal point & the
                    # previous nominal of nominal points
                    new_th = np.linalg.norm(new_trp - old_trp, ord=1)

                    # If the new point is inside the previous L1 ball, don't consider it &
                    # proceed with the previous trp & threshold
                    if (new_th - old_th) < 0.0001:
                        trp, th = old_trp, old_th

                # Consider the new nominal point to construct a new uncertainty set. This block
                # executes if there is no previous nominal_threshold entry or the new nominal
                # point lies outside of the existing L1 ball.
                if trp is None:
                    is_mdp_unchanged = False
                    nominal_points[(s, a)].append(new_trp)

                    # Find the center of the L1 ball for the nominal points obtained with
                    # different value functions
                    trp, th = find_nominal_point(np.asarray(nominal_points[(s, a)]))
                    nominal_threshold[(s, a)] = (trp, th)

                threshold[0].append(s)
                threshold[1].append(a)
                threshold[2].append(th)

                trp /= np.sum(trp)
                # Add the current transition to the RMDP
                for next_st in range(num_next_states):
                    rmdp.add_transition(s, a, next_st, trp[int(next_st)], rewards[s, a, next_st])

        # Solve the current RMDP
        rsol = rmdp.rsolve_mpi(b"optimistic_l1", threshold)

        # If the whole MDP is unchanged, meaning the new value function didn't change the
        # uncertainty set for any state-action, there is no need to iterate further.
        if is_mdp_unchanged or i == num_update-1:
            #print("**** Add Values *****")
            #print("MDP remains unchanged after number of iteration:",i)
            #print("rsol.valuefunction",rsol.valuefunction)
            return rsol

        valuefunction = rsol.valuefunction
        valuefunctions.append(valuefunction)
        i += 1
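# find_nominal_point() is referenced above (and in RSVF below) but is not defined in this section.
# The sketch here is a minimal approximation, assuming the helper returns a center of the L1 ball
# covering the given points together with that ball's radius; the original implementation may
# instead solve an exact optimization problem for the smallest enclosing L1 ball.
def find_nominal_point(points):
    """Approximate the center of the smallest L1 ball containing `points` by the element-wise
    mean, and return (center, radius), where radius is the largest L1 distance to the center."""
    center = np.mean(points, axis=0)
    radius = max(np.linalg.norm(p - center, ord=1) for p in points)
    return center, radius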
def Optimism_VF(num_states, num_actions, num_next_states, true_transitions, rewards, discount_factor, confidence, num_bayes_samples, num_episodes, num_runs, horizon, true_solution):
    #num_bayes_samples = 20
    num_update = 10
    confidences = []

    regret_OFVF = np.zeros( (num_runs, num_episodes) )
    violations = np.zeros( (num_runs, num_episodes) )

    for m in range(num_runs):
        # Initialize uniform Dirichlet prior
        prior = np.ones( (num_states, num_actions, num_next_states) )
        samples = np.zeros( (num_states, num_actions, num_next_states) )
        posterior = prior + samples

        # Run episodes for the PSRL
        for k in range(num_episodes):
            sampled_mdp = crobust.MDP(0, discount_factor)

            confidence = 1 - 1/(k+1)
            # !!! Apply union bound to compute confidence for each state-action
            sa_confidence = 1 - (1-confidence)/(num_states*num_actions)
            if m == 0:
                confidences.append(confidence)

            # Compute posterior
            posterior = posterior + samples
            thresholds = [[] for _ in range(3)]

            posterior_transition_points = {}
            for s in range(num_states):
                for a in range(num_actions):
                    bayes_samples = np.random.dirichlet(posterior[s, a], num_bayes_samples)
                    posterior_transition_points[(s, a)] = bayes_samples

                    nominal_point_bayes = np.mean(bayes_samples, axis=0)
                    nominal_point_bayes /= np.sum(nominal_point_bayes)

                    bayes_threshold = compute_bayesian_threshold(bayes_samples, nominal_point_bayes, sa_confidence)

                    for s_next in range(num_next_states):
                        sampled_mdp.add_transition(s, a, s_next, nominal_point_bayes[s_next], rewards[s, a, s_next])

                    # construct the threshold for each state-action
                    thresholds[0].append(s)                # from state
                    thresholds[1].append(a)                # action
                    thresholds[2].append(bayes_threshold)  # allowed deviation

            # Compute current solution
            cur_solution = sampled_mdp.rsolve_mpi(b"optimistic_l1", np.array(thresholds))  # solve_mpi()

            rsol = OFVF(num_states, num_actions, num_next_states, cur_solution[0], posterior_transition_points, num_update, sa_confidence, discount_factor)

            regret_OFVF[m, k] = abs(rsol[0][0] - true_solution[0][0])
            violations[m, k] = rsol[0][0] - true_solution[0][0]

            rpolicy = rsol.policy

            samples = np.zeros((num_states, num_actions, num_next_states))

            # Follow the policy to collect transition samples
            cur_state = 0
            for h in range(horizon):
                action = rpolicy[cur_state]
                #print("cur_state", cur_state, "cur_action", action)
                next_state = np.random.choice(num_next_states, 1, p=true_transitions[cur_state, action])[0]
                #print("next_state", next_state)
                samples[cur_state, action, next_state] += 1
                cur_state = next_state

    #regret_OFVF = np.mean(regret_OFVF, axis=1)
    #plt.plot(np.cumsum(regret_OFVF))
    #plt.show()

    violations = np.mean(violations < 0, axis=0)

    return np.amin(regret_OFVF, axis=0), np.mean(regret_OFVF, axis=0), violations, confidences
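# compute_bayesian_threshold() is used in Optimism_VF() above and in BayesUCRL() below but is not
# defined in this section. A minimal sketch is given here, assuming the helper returns the L1
# radius around the nominal point that covers the required fraction of the posterior samples; the
# actual implementation elsewhere in the codebase may differ in details.
def compute_bayesian_threshold(bayes_samples, nominal_point, confidence_level):
    """Return the confidence_level quantile of the L1 distances between the posterior samples
    and the nominal point, i.e. an L1 ambiguity-set radius with the requested Bayesian coverage."""
    distances = [np.linalg.norm(p - nominal_point, ord=1) for p in bayes_samples]
    return float(np.quantile(distances, confidence_level))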
def UCRL2(num_states, num_actions, num_next_states, true_transitions, rewards, discount_factor, num_episodes, num_runs, horizon, true_solution):
    """
    Implements the UCRL2 algorithm described in the Jaksch2010 (Near-optimal Regret Bounds for
    Reinforcement Learning) paper.

    Parameters
    ----------
    num_states : int
        Number of states in the MDP
    num_actions : int
        Number of actions in the MDP
    num_next_states : int
        Number of possible next states. A common formulation is to keep it the same as
        num_states, with the transition probabilities adjusted accordingly.
    true_transitions : numpy array
        num_states x num_actions x num_next_states dimensional array containing the true
        transition parameters.
    rewards : numpy array
        num_states x num_actions x num_next_states dimensional array containing the rewards
    discount_factor : float
        Discount factor for the MDP
    num_episodes : int
        Number of episodes to run
    num_runs : int
        Number of runs, to average over
    horizon : int
        Number of steps to execute the policy in each episode
    true_solution : Solution object of CRAAM
        The solution of the true MDP

    Returns
    --------
    numpy array
        Computed regret
    """
    # ***start UCRL2***
    #num_next_states = num_states
    regret_ucrl = np.zeros( (num_runs, num_episodes) )
    #num_runs, num_episodes = 1, 1

    for m in range(num_runs):
        t = 1
        # state-action counts
        Nk = np.zeros( (num_states, num_actions) )
        Nk_ = np.zeros( (num_states, num_actions) )

        # accumulated rewards
        Rk = np.zeros( (num_states, num_actions) )

        # accumulated transition counts, initialized to uniform transition
        Pk = np.ones( (num_states, num_actions, num_next_states) ) / num_next_states

        for k in range(num_episodes):
            # ***Initialize
            tk = t  # set the start time of episode k
            Vk = np.zeros( (num_states, num_actions) )  # init the state-action count for episode k
            Nk += Nk_
            r_hat = [Rk[s, a]/max(1, Nk[s, a]) for s in range(num_states) for a in range(num_actions)]
            p_hat = np.array([Pk[s, a, s_next]/max(1, Nk[s, a])
                              for s in range(num_states)
                              for a in range(num_actions)
                              for s_next in range(num_next_states)]).reshape(
                                  (num_states, num_actions, num_next_states))
            for s in range(num_states):
                for a in range(num_actions):
                    p_hat[s, a] /= np.sum(p_hat[s, a])

            # ***Compute policy
            psi_r = [math.sqrt(7*math.log(2*num_next_states*num_actions*tk/confidence)/(2*max(1, Nk[s, a])))
                     for s in range(num_states) for a in range(num_actions)]
            psi_a = [math.sqrt(14*num_next_states*math.log(2*num_actions*tk/confidence)/(max(1, Nk[s, a])))
                     for s in range(num_states) for a in range(num_actions)]

            estimated_mdp = crobust.MDP(0, discount_factor)
            thresholds = [[] for _ in range(3)]

            for s in range(num_states):
                for a in range(num_actions):
                    for s_next in range(num_next_states):
                        # the reward is upper bounded by psi_r from the mean reward r_hat
                        estimated_mdp.add_transition(s, a, s_next, p_hat[s, a, s_next], rewards[s, a, s_next])
                    # construct the threshold for each state-action
                    thresholds[0].append(s)  # from state
                    thresholds[1].append(a)  # action
                    thresholds[2].append(psi_a[s*num_actions + a])  # allowed deviation; psi_a is a flat (s,a)-ordered list
            #print(estimated_mdp.to_json())
            computed_solution = estimated_mdp.rsolve_mpi(b"optimistic_l1", np.array(thresholds))
            computed_policy = computed_solution.policy

            regret_ucrl[m, k] = abs(abs(computed_solution[0][0]) - true_solution[0][0])

            # ***Execute policy
            Nk_ = np.zeros( (num_states, num_actions) )
            cur_state = 0
            for h in range(horizon):  # Vk[action] < max(1, Nk[action]):
                action = computed_policy[cur_state]
                next_state = np.random.choice(num_next_states, 1, p=true_transitions[cur_state, action])[0]
                reward = rewards[cur_state, action, next_state]
                Vk[cur_state, action] += 1
                t += 1
                Rk[cur_state, action] += 1
                Pk[cur_state, action, next_state] += 1
                Nk_[cur_state, action] += 1
                cur_state = next_state

    #regret_ucrl = np.mean(regret_ucrl, axis=0)
    return np.amin(regret_ucrl, axis=0), np.mean(regret_ucrl, axis=0)
def PSRL(num_states, num_actions, num_next_states, true_transitions, rewards, discount_factor, num_episodes, num_runs, horizon, true_solution):
    """
    Implements the Posterior Sampling RL algorithm described in the Osband2013 ((More) Efficient
    Reinforcement Learning via Posterior Sampling) paper.

    Parameters
    ----------
    num_states : int
        Number of states in the MDP
    num_actions : int
        Number of actions in the MDP
    num_next_states : int
        Number of possible next states. A common formulation is to keep it the same as
        num_states, with the transition probabilities adjusted accordingly.
    true_transitions : numpy array
        num_states x num_actions x num_next_states dimensional array containing the true
        transition parameters.
    rewards : numpy array
        num_states x num_actions x num_next_states dimensional array containing the rewards
    discount_factor : float
        Discount factor for the MDP
    num_episodes : int
        Number of episodes to run
    num_runs : int
        Number of runs, to average over
    horizon : int
        Number of steps to execute the policy in each episode
    true_solution : Solution object of CRAAM
        The solution of the true MDP

    Returns
    --------
    numpy array
        Computed regret
    """
    regret_psrl = np.zeros( (num_runs, num_episodes) )

    for m in range(num_runs):
        # Initialize uniform Dirichlet prior
        prior = np.ones( (num_states, num_actions, num_next_states) )
        samples = np.zeros( (num_states, num_actions, num_next_states) )
        posterior = prior + samples

        # Run episodes for the PSRL
        for k in range(num_episodes):
            sampled_mdp = crobust.MDP(0, discount_factor)

            # Compute posterior
            posterior = posterior + samples

            for s in range(num_states):
                for a in range(num_actions):
                    trp = np.random.dirichlet(posterior[s, a], 1)[0]
                    for s_next in range(num_next_states):
                        sampled_mdp.add_transition(s, a, s_next, trp[s_next], rewards[s, a, s_next])

            # Compute current solution
            cur_solution = sampled_mdp.solve_mpi()
            cur_policy = cur_solution.policy
            #action = cur_policy[0]  # action for the nonterminal state 0
            regret_psrl[m, k] = abs(cur_solution[0][0] - true_solution[0][0])
            #print("PSRL cur_solution[0][0]",cur_solution[0][0], "Regret: ", regret_psrl[k,m])

            samples = np.zeros((num_states, num_actions, num_next_states))

            # Follow the policy to collect transition samples
            cur_state = 0
            for h in range(horizon):
                action = cur_policy[cur_state]
                next_state = np.random.choice(num_next_states, 1, p=true_transitions[cur_state, action])[0]
                samples[cur_state, action, next_state] += 1
                cur_state = next_state

    #regret_psrl = np.mean(regret_psrl, axis=1)
    return np.amin(regret_psrl, axis=0), np.mean(regret_psrl, axis=0)
def BayesUCRL(num_states, num_actions, num_next_states, true_transitions, rewards, discount_factor, confidence, num_bayes_samples, num_episodes, num_runs, horizon, true_solution):
    """
    Implements the Bayes UCRL idea: computes the ambiguity set from posterior samples for the
    required confidence level.

    Parameters
    ----------
    num_states : int
        Number of states in the MDP
    num_actions : int
        Number of actions in the MDP
    num_next_states : int
        Number of possible next states. A common formulation is to keep it the same as
        num_states, with the transition probabilities adjusted accordingly.
    true_transitions : numpy array
        num_states x num_actions x num_next_states dimensional array containing the true
        transition parameters.
    rewards : numpy array
        num_states x num_actions x num_next_states dimensional array containing the rewards
    discount_factor : float
        Discount factor for the MDP
    confidence : float
        The required PAC confidence
    num_bayes_samples : int
        Number of Bayes samples to take from the posterior
    num_episodes : int
        Number of episodes to run
    num_runs : int
        Number of runs, to average over
    horizon : int
        Number of steps to execute the policy in each episode
    true_solution : Solution object of CRAAM
        The solution of the true MDP

    Returns
    --------
    numpy array
        Computed regret
    """
    #num_bayes_samples = 20
    regret_bayes_ucrl = np.zeros( (num_runs, num_episodes) )

    for m in range(num_runs):
        # Initialize uniform Dirichlet prior
        prior = np.ones( (num_states, num_actions, num_next_states) )
        samples = np.zeros((num_states, num_actions, num_next_states))
        posterior = prior + samples

        # Run episodes for the PSRL
        for k in range(num_episodes):
            sampled_mdp = crobust.MDP(0, discount_factor)

            # Compute posterior
            posterior = posterior + samples
            thresholds = [[] for _ in range(3)]

            confidence = 1 - 1/(k+1)
            # !!! Apply union bound to compute confidence for each state-action
            sa_confidence = 1 - (1-confidence)/(num_states*num_actions)

            for s in range(num_states):
                for a in range(num_actions):
                    bayes_samples = np.random.dirichlet(posterior[s, a], num_bayes_samples)
                    nominal_point_bayes = np.mean(bayes_samples, axis=0)
                    nominal_point_bayes /= np.sum(nominal_point_bayes)

                    bayes_threshold = compute_bayesian_threshold(bayes_samples, nominal_point_bayes, sa_confidence)

                    for s_next in range(num_next_states):
                        sampled_mdp.add_transition(s, a, s_next, nominal_point_bayes[s_next], rewards[s, a, s_next])

                    # construct the threshold for each state-action
                    thresholds[0].append(s)                # from state
                    thresholds[1].append(a)                # action
                    thresholds[2].append(bayes_threshold)  # allowed deviation

            # Compute current solution
            cur_solution = sampled_mdp.rsolve_mpi(b"optimistic_l1", np.array(thresholds))
            cur_policy = cur_solution.policy
            #action = cur_policy[0]  # action for the nonterminal state 0
            regret_bayes_ucrl[m, k] = abs(abs(cur_solution[0][0]) - true_solution[0][0])
            #print("Bayes UCRL cur_solution[0][0]",cur_solution[0][0], "Regret: ", regret_bayes_ucrl[k,m])

            samples = np.zeros((num_states, num_actions, num_next_states))

            # Follow the policy to collect transition samples
            cur_state = 0
            for h in range(horizon):
                action = cur_policy[cur_state]
                #print("cur_state", cur_state, "cur_action", action)
                next_state = np.random.choice(num_next_states, 1, p=true_transitions[cur_state, action])[0]
                #print("next_state", next_state)
                samples[cur_state, action, next_state] += 1
                cur_state = next_state

    #regret_bayes_ucrl = np.mean(regret_bayes_ucrl, axis=1)
    return np.amin(regret_bayes_ucrl, axis=0), np.mean(regret_bayes_ucrl, axis=0)
num_next_states = 3
num_actions = 3
discount_factor = 0.95
num_episodes = 100
num_runs = 10
horizon = 10
num_bayes_samples = 100

# rewards for the 3 possible next terminal states
rewards = np.arange(1, num_next_states + 1, dtype=float) * 10

# 3 possible actions, each action can lead to 3 terminal states with different probabilities.
# transitions[i] is the transition probability vector for action i.
transitions = np.array([[0.6, 0.2, 0.2],
                        [0.2, 0.6, 0.2],
                        [0.2, 0.2, 0.6]])

# Construct the true MDP with the true parameters
true_mdp = crobust.MDP(0, discount_factor)
for a in range(num_actions):
    for s in range(num_next_states):
        true_mdp.add_transition(0, a, s + 1, transitions[a, s], rewards[s])

true_solution = true_mdp.solve_mpi()
print(true_solution)

### UCRL2
if __name__ == "__main__":
    # ***start UCRL2***
    t = 1
    regret_ucrl = np.zeros((num_runs, num_episodes))

    for m in range(num_runs):
        # state-action counts
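# Hypothetical driver (illustration only): compare two of the agents defined above on the single
# decision-state MDP constructed here. The reshaping of `transitions` and `rewards` into
# state x action x next-state arrays, the variable names, and the plotting choices are assumptions
# about how the functions above are meant to be fed; they are not part of the original script.
import matplotlib.pyplot as plt

n_learner_states = 1  # only the decision state is modeled by the learners
true_transitions_sas = transitions.reshape(n_learner_states, num_actions, num_next_states)
rewards_sas = np.broadcast_to(rewards, (n_learner_states, num_actions, num_next_states))

psrl_worst, psrl_mean = PSRL(n_learner_states, num_actions, num_next_states, true_transitions_sas,
                             rewards_sas, discount_factor, num_episodes, num_runs, horizon,
                             true_solution)
bucrl_worst, bucrl_mean = BayesUCRL(n_learner_states, num_actions, num_next_states,
                                    true_transitions_sas, rewards_sas, discount_factor, 0.95,
                                    num_bayes_samples, num_episodes, num_runs, horizon,
                                    true_solution)

# UCRL2 and Optimism_VF could be run analogously and added to the same plot.
plt.plot(np.cumsum(psrl_mean), label="PSRL")
plt.plot(np.cumsum(bucrl_mean), label="Bayes UCRL")
plt.xlabel("episode")
plt.ylabel("cumulative regret")
plt.legend()
plt.show()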
def incrementally_replace_V(valuefunction, num_samples, num_simulation,
                            num_update, sa_confidence, orig_sol):
    """
    Method to incrementally improve the value function by replacing the old value function with
    the new one.

    @valuefunction The initially known value function
    @num_samples Number of samples to estimate the true distribution
    @num_simulation Number of simulations
    @num_update Number of updates over the value functions
    @sa_confidence Required confidence for each state-action
    @orig_sol The solution to the estimated true MDP

    @return under_estimate, real_regret, violation
    """
    horizon = 1
    X = []
    Y = []

    list_transitions_points = {}
    for s in population:
        #transitions_points = get_Bootstrapped_transition_reward(s, horizon,\
        #num_samples, np.random.randint(len(population)))
        #print("Incrementally replace V")
        transitions_points, _, _ = get_Bayesian_transition_kernel(s, num_samples)
        list_transitions_points[s] = transitions_points

    under_estimate = 99999
    real_regret = 0.0
    for i in range(num_update):
        threshold = [[] for _ in range(3)]
        rmdp = crobust.MDP(0, discount_factor)
        for s in population:
            #transitions_points = get_Bootstrapped_transition_reward(s, horizon, num_samples, i)
            transitions_points = list_transitions_points[s]  #get_Bayesian_transition_kernel(s, num_samples)
            for a in range(num_actions):
                dir_points = np.asarray(transitions_points[a])
                res = construct_uset_known_value_function(dir_points, valuefunction, sa_confidence)

                threshold[0].append(s)
                threshold[1].append(a)
                threshold[2].append(res[1])

                trp = res[2]
                for next_st in population:
                    #reward = calc_reward(next_st, trp[int(next_st)], a)
                    reward = calc_reward(s, a)
                    rmdp.add_transition(s, a, next_st, trp[int(next_st)], reward)

        rsol = rmdp.rsolve_mpi(b"robust_l1", threshold)
        rpolicy = rsol.policy
        violation = 0

        #rret = rmdp.solve_mpi(policy=rpolicy)
        ret = est_true_mdp.solve_mpi(policy=rpolicy)
        cur_regret = abs(np.dot(initial, ret.valuefunction) - np.dot(initial, rsol.valuefunction))
        if cur_regret > under_estimate:
            #ropt_sol = est_true_mdp.solve_mpi(policy=rpolicy)
            real_regret = abs(np.dot(initial, orig_sol.valuefunction) -
                              np.dot(initial, ret.valuefunction))
            violation = 1 if (np.dot(initial, ret.valuefunction) -
                              np.dot(initial, rsol.valuefunction)) < 0 else 0
            break

        under_estimate = cur_regret
        valuefunction = rsol.valuefunction
        X.append(i)
        Y.append(valuefunction[0])

    return under_estimate, real_regret, violation
def RSVF(valuefunctions, posterior_transition_points, num_samples, num_update,
         sa_confidence, orig_sol):
    """
    Method to incrementally improve the value function by adding the new value function to the
    previous value functions, and finding the nominal point & threshold for this cluster of
    value functions with the required sa-confidence.

    @valuefunctions The initially known value function computed from the true MDP
    @posterior_transition_points The posterior transition points obtained from Bayesian sampling,
        for which the nominal point & threshold are computed
    @num_samples Number of samples to estimate the true distribution
    @num_update Number of updates over the value functions
    @sa_confidence Required confidence for each state-action, computed from the union bound
    @orig_sol The solution to the estimated true MDP

    @return under_estimate, real_regret, violation
    """
    horizon = 1
    X = []
    Y = []

    valuefunctions = [valuefunctions]
    th_list = []
    """
    list_transitions_points = {}
    for s in population:
        #transitions_points = get_Bootstrapped_transition_reward(s, horizon,\
        #num_samples, np.random.randint(len(population)))
        #print("incrementally add v")
        transitions_points, _, _ = get_Bayesian_transition_kernel(s, num_samples)
        list_transitions_points[s] = transitions_points
    """

    # Store the nominal points for each state-action pair
    nominal_points = {}

    # Store the latest nominal of nominal points & threshold
    nominal_threshold = {}
    under_estimate, real_regret = 0.0, 0.0
    i = 0
    while i <= num_update:
        try:
            # Keep track of whether the current iteration leaves the MDP unchanged
            is_mdp_unchanged = True
            threshold = [[] for _ in range(3)]
            rmdp = crobust.MDP(0, discount_factor)
            for s in population:
                for a in range(num_actions):
                    trans = np.asarray(posterior_transition_points[s][a])
                    RSVF_th = []  # ** Not being used
                    RSVF_nominalPoints = []

                    for dir_points in trans:
                        ivf = construct_uset_known_value_function(dir_points, valuefunctions[-1],
                                                                  sa_confidence)
                        RSVF_th.append(ivf[1])
                        RSVF_nominalPoints.append(ivf[2])
                    new_trp = np.mean(RSVF_nominalPoints, axis=0)

                    if (s, a) not in nominal_points:
                        nominal_points[(s, a)] = []

                    trp, th = None, 0
                    # If there is a previously constructed L1 ball, check whether the new nominal
                    # point lies outside of the current L1 ball & needs to be considered.
                    if (s, a) in nominal_threshold:
                        old_trp, old_th = nominal_threshold[(s, a)][0], nominal_threshold[(s, a)][1]

                        # Compute the L1 distance between the newly computed nominal point & the
                        # previous nominal of nominal points
                        new_th = np.linalg.norm(new_trp - old_trp, ord=1)

                        # If the new point is inside the previous L1 ball, don't consider it &
                        # proceed with the previous trp & threshold
                        if (new_th - old_th) < 0.0001:
                            trp, th = old_trp, old_th

                    # Consider the new nominal point to construct a new uncertainty set. This block
                    # executes if there is no previous nominal_threshold entry or the new nominal
                    # point lies outside of the existing L1 ball.
                    if trp is None:
                        is_mdp_unchanged = False
                        nominal_points[(s, a)].append(new_trp)

                        # Find the center of the L1 ball for the nominal points obtained with
                        # different value functions
                        trp, th = find_nominal_point(np.asarray(nominal_points[(s, a)]))
                        nominal_threshold[(s, a)] = (trp, th)

                    threshold[0].append(s)
                    threshold[1].append(a)
                    threshold[2].append(th)

                    # Add the current transition to the RMDP
                    for next_st in population:
                        #reward = calc_reward(next_st, trp[int(next_st)], a)
                        reward = calc_reward(s, a)
                        rmdp.add_transition(s, a, next_st, trp[int(next_st)], reward)

            # Solve the current RMDP
            rsol = rmdp.rsolve_mpi(b"robust_l1", threshold)

            violation = 0

            # If the whole MDP is unchanged, meaning the new value function didn't change the
            # uncertainty set for any state-action, there is no need to iterate further.
            if is_mdp_unchanged or i == num_update-1:
                print("**** Add Values *****")
                print("MDP remains unchanged after number of iteration:", i)
                #print("rmdp", rmdp.to_json())
                #print("threshold", threshold)
                #print("Policy",rsol.policy, "threshold", threshold)
                print("rsol.valuefunction", rsol.valuefunction)

                ropt_sol = est_true_mdp.solve_mpi(policy=rsol.policy)

                under_estimate = abs(np.dot(initial, orig_sol.valuefunction) -
                                     np.dot(initial, rsol.valuefunction))

                real_regret = abs(np.dot(initial, orig_sol.valuefunction) -
                                  np.dot(initial, ropt_sol.valuefunction))

                violation = 1 if (np.dot(initial, ropt_sol.valuefunction) -
                                  np.dot(initial, rsol.valuefunction)) < 0 else 0
                break

            valuefunction = rsol.valuefunction
            valuefunctions.append(valuefunction)
            X.append(i)
            Y.append(valuefunction[0])
            i += 1
        except Exception as e:
            print("!!! Unexpected Error in RSVF !!!", sys.exc_info()[0])
            print(e)
            continue

    return under_estimate, real_regret, violation
under_estimation = [[] for _ in range(Methods.NUM_METHODS.value)]  # estimated regret
real_regret = [[] for _ in range(Methods.NUM_METHODS.value)]       # optimal regret
violations = [[] for _ in range(Methods.NUM_METHODS.value)]

#num_samples = sample_step
#pbar = tqdm.tqdm(total = (sample_step*num_iterations+1) )
#while num_samples <= (sample_step*num_iterations+1):
for pos, num_samples in enumerate(tqdm.tqdm(sample_steps)):
    cur_under_estimation = np.zeros( (Methods.NUM_METHODS.value, runs) )
    cur_real_regret = np.zeros( (Methods.NUM_METHODS.value, runs) )
    cur_violations = np.zeros( (Methods.NUM_METHODS.value, runs) )

    i = 0
    while i < runs:
        try:
            est_true_mdp = crobust.MDP(0, discount_factor)

            rmdps = []
            for m in range(Methods.NUM_METHODS.value):
                rmdps.append(crobust.MDP(0, discount_factor))

            posterior_transition_points = {}
            for s in population:
                # Get the nominal points & thresholds for each state & all actions of the Bayes,
                # Mean, Hoeff & HoeffTight RMDPs. Get the true transition points & the posterior
                # transition points for RSVF.
                params, true_transition_points, posterior_transition_points[s] = \
                    evaluate_uncertainty_set(s, num_samples, num_simulation, sa_confidence)

                # Construct the true MDP with the true transition points
                for a in range(num_actions):
                    for next_st in population:
def train(self, q_init_ret):
    """
    Implements the posterior sampling RL algorithm for the Mountain Car problem of OpenAI Gym.

    Returns
    --------
    worst case regret, average regret and the final solution
    """
    regret_psrl = np.zeros((self.num_runs, self.num_episodes))
    prior = obtain_parametric_priors(self.resolution, self.num_actions)

    # initially the posterior is the same as the prior
    posterior = prior
    mp_rewards = {}

    for run in range(self.num_runs):
        print("run: ", run)
        # run episodes for the PSRL
        for episode in range(self.num_episodes):
            print("episode: ", episode)
            sampled_mdp = crobust.MDP(0, self.discount_factor)
            #thresholds = [[] for _ in range(3)]

            # iterate over all state-actions, sample from the posterior distribution and
            # construct the sampled MDP
            for s in self.all_states:
                p, v = obs_to_index(s, self.env_low, self.env_dx)
                cur_state = index_to_single_index(p, v, self.resolution)
                for action in range(self.num_actions):
                    samples = posterior[p, v, action]
                    next_states = []
                    visit_stats = []

                    # unbox keys (next states) and values (Dirichlet prior parameters) from the
                    # samples dictionary.
                    for key, value in samples.items():
                        next_states.append(index_to_single_index(key[0], key[1], self.resolution))
                        visit_stats.append(value)

                    # sample from the Dirichlet distribution with the prior parameters
                    trp = np.random.dirichlet(visit_stats, 1)[0]

                    for s_index, s_next in enumerate(next_states):
                        sampled_mdp.add_transition(
                            cur_state, action, s_next, trp[s_index],
                            get_reward(s_next, self.resolution, self.grid_x, self.grid_y))

                    # construct the threshold for each state-action
                    #thresholds[0].append(cur_state)  # from state
                    #thresholds[1].append(action)     # action
                    #thresholds[2].append(1.0)        # allowed deviation

            # Compute current solution
            cur_solution = sampled_mdp.solve_mpi()
            cur_policy = cur_solution.policy

            # Compute current solution
            #cur_solution = sampled_mdp.rsolve_mpi(b"optimistic_l1",np.array(thresholds))
            #cur_policy = cur_solution.policy

            # The initial state is uniformly distributed; compute the expected value over the
            # initial states.
            expected_value_initial_state = 0
            for init in self.init_states:
                state = index_to_single_index(init[0], init[1], self.resolution)
                expected_value_initial_state += cur_solution[0][state]
            expected_value_initial_state /= len(self.init_states)

            # The Q-learning solution is treated as the true solution; the solution produced here
            # by PSRL is the approximate solution, and the difference between them is the regret.
            regret_psrl[run, episode] = abs(q_init_ret - expected_value_initial_state)
            #abs(cur_solution[0][0]-true_solution[0][0])
            print(q_init_ret, expected_value_initial_state)

            # Follow the policy to collect transition samples
            cur_state = obs_to_index(self.env.reset(), self.env_low, self.env_dx)
            for h in range(self.horizon):
                action = cur_policy[index_to_single_index(cur_state[0], cur_state[1], self.resolution)]
                next_state, reward, done, info = self.env.step(action)
                next_state = obs_to_index(next_state, self.env_low, self.env_dx)
                mp_rewards[index_to_single_index(next_state[0], next_state[1], self.resolution)] = reward

                # posterior[cur_position, cur_velocity, action][next_position, next_velocity] is
                # the entry to update with this sample. This combines the current sample with the
                # prior, which constitutes the posterior.
                if (next_state[0], next_state[1]) not in posterior[cur_state[0], cur_state[1], action]:
                    posterior[cur_state[0], cur_state[1], action][next_state[0], next_state[1]] = 0
                posterior[cur_state[0], cur_state[1], action][next_state[0], next_state[1]] += 1

                cur_state = next_state
                if done:
                    print("----- destination reached in", h, "steps, done execution. -----")
                    break

    return np.amin(regret_psrl, axis=0), np.mean(regret_psrl, axis=0), cur_solution
def train(self, num_bayes_samples, q_init_ret):
    """
    Implements the Bayes UCRL idea: computes the ambiguity set from posterior samples for the
    required confidence level.

    Returns
    --------
    numpy array
        Computed regret
    """
    #num_bayes_samples = 20
    regret_bayes_ucrl = np.zeros( (self.num_runs, self.num_episodes) )

    prior = obtain_parametric_priors(self.resolution, self.num_actions)

    # initially the posterior is the same as the prior
    posterior = prior

    for run in range(self.num_runs):
        for episode in range(self.num_episodes):
            sampled_mdp = crobust.MDP(0, self.discount_factor)

            # Compute posterior
            #posterior = posterior+samples
            thresholds = [[] for _ in range(3)]

            num_states = self.resolution*self.resolution
            confidence = 1 - 1/(episode+1)
            # !!! Apply union bound to compute confidence for each state-action
            sa_confidence = 1 - (1-confidence)/(num_states*self.num_actions)

            # iterate over all state-actions, sample from the posterior distribution and
            # construct the sampled MDP
            for s in self.all_states:
                p, v = obs_to_index(s, self.env_low, self.env_dx)
                print("s", s, p, v)
                cur_state = index_to_single_index(p, v, self.resolution)
                for action in range(self.num_actions):
                    samples = posterior[p, v, action]
                    next_states = []
                    visit_stats = []

                    # unbox keys (next states) and values (Dirichlet prior parameters) from the
                    # samples dictionary.
                    for key, value in samples.items():
                        next_states.append(index_to_single_index(key[0], key[1], self.resolution))
                        visit_stats.append(value)

                    # sample from the Dirichlet distribution with the prior parameters
                    # trp = np.random.dirichlet(visit_stats, 1)
                    bayes_samples = np.random.dirichlet(visit_stats, num_bayes_samples)
                    nominal_point_bayes = np.mean(bayes_samples, axis=0)
                    nominal_point_bayes /= np.sum(nominal_point_bayes)

                    bayes_threshold = compute_bayesian_threshold(bayes_samples, nominal_point_bayes, sa_confidence)

                    for s_index, s_next in enumerate(next_states):
                        sampled_mdp.add_transition(
                            cur_state, action, s_next, nominal_point_bayes[s_index],
                            get_reward(s_next, self.resolution, self.grid_x, self.grid_y))

                    # construct the threshold for each state-action
                    thresholds[0].append(cur_state)        # from state
                    thresholds[1].append(action)           # action
                    thresholds[2].append(bayes_threshold)  # allowed deviation

            # Compute current solution
            cur_solution = sampled_mdp.rsolve_mpi(b"optimistic_l1", np.array(thresholds))
            cur_policy = cur_solution.policy

            # The initial state is uniformly distributed; compute the expected value over the
            # initial states.
            expected_value_initial_state = 0
            for init in self.init_states:
                state = index_to_single_index(init[0], init[1], self.resolution)
                expected_value_initial_state += cur_solution[0][state]
            expected_value_initial_state /= len(self.init_states)

            # The Q-learning solution is treated as the true solution; the solution produced here
            # is the approximate solution, and the difference between them is the regret.
            regret_bayes_ucrl[run, episode] = abs(q_init_ret - expected_value_initial_state)
            #abs(cur_solution[0][0]-true_solution[0][0])

            # Follow the policy to collect transition samples
            cur_state = obs_to_index(self.env.reset(), self.env_low, self.env_dx)
            for h in range(self.horizon):
                action = cur_policy[index_to_single_index(cur_state[0], cur_state[1], self.resolution)]
                next_state, reward, done, info = self.env.step(action)
                next_state = obs_to_index(next_state, self.env_low, self.env_dx)

                # posterior[cur_position, cur_velocity, action][next_position, next_velocity] is
                # the entry to update with this sample. This combines the current sample with the
                # prior, which constitutes the posterior.
                if (next_state[0], next_state[1]) not in posterior[cur_state[0], cur_state[1], action]:
                    posterior[cur_state[0], cur_state[1], action][next_state[0], next_state[1]] = 0
                posterior[cur_state[0], cur_state[1], action][next_state[0], next_state[1]] += 1

                cur_state = next_state
                if done:
                    print("----- destination reached in", h, "steps, done execution. -----")
                    break

    return np.amin(regret_bayes_ucrl, axis=0), np.mean(regret_bayes_ucrl, axis=0), cur_solution
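# obs_to_index() and index_to_single_index() are used throughout the Mountain Car code above but
# are not defined in this section. The sketches below show one plausible implementation of the
# assumed grid discretization; the actual helpers elsewhere in the codebase may differ.
def obs_to_index(obs, env_low, env_dx):
    """Map a continuous (position, velocity) observation to discrete grid indices."""
    position_index = int((obs[0] - env_low[0]) / env_dx[0])
    velocity_index = int((obs[1] - env_low[1]) / env_dx[1])
    return position_index, velocity_index

def index_to_single_index(p, v, resolution):
    """Flatten a (position_index, velocity_index) pair into a single state id."""
    return p * resolution + v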
def incrementally_add_V(valuefunctions, num_samples, num_simulation,
                        num_update, sa_confidence, orig_sol):
    """
    Method to incrementally improve the value function by adding the new value function to the
    previous value functions, and finding the nominal point & threshold for this cluster of
    value functions with the required sa-confidence.

    @valuefunctions The initially known value function
    @num_samples Number of samples to estimate the true distribution
    @num_simulation Number of simulations
    @num_update Number of updates over the value functions
    @sa_confidence Required confidence for each state-action
    @orig_sol The solution to the estimated true MDP

    @return under_estimate, real_regret, violation
    """
    X = []
    Y = []

    valuefunctions = [valuefunctions]
    th_list = []

    list_transitions_points = {}
    for s in range(num_total_states):
        for a in range(num_total_actions):
            transitions_points = np.random.dirichlet(transition_samples[s, a], bayes_samples)
            list_transitions_points[(s, a)] = transitions_points

    # Store the nominal points for each state-action pair
    nominal_points = {}

    # Store the latest nominal of nominal points & threshold
    nominal_threshold = {}
    under_estimate, real_regret = 0.0, 0.0
    for i in range(num_update):
        #print("valuefunctions",i,": ",valuefunctions)
        # Keep track of whether the current iteration leaves the MDP unchanged
        is_mdp_unchanged = True
        threshold = [[] for _ in range(3)]
        rmdp = crobust.MDP(0, discount_factor)
        for s in range(num_total_states):
            for a in range(num_total_actions):
                dir_points = list_transitions_points[(s, a)]  #np.asarray(transitions_points[a])
                res = construct_uset_known_value_function(dir_points, valuefunctions[-1],
                                                          sa_confidence)

                if (s, a) not in nominal_points:
                    nominal_points[(s, a)] = []

                trp, th = None, 0
                # If there is a previously constructed L1 ball, check whether the new nominal
                # point needs to be considered.
                if (s, a) in nominal_threshold:
                    old_trp, old_th = nominal_threshold[(s, a)][0], nominal_threshold[(s, a)][1]

                    # Compute the L1 distance between the newly computed nominal point & the
                    # previous nominal of nominal points
                    new_th = np.linalg.norm(res[2] - old_trp, ord=1)

                    # If the new point is inside the previous L1 ball, don't consider it &
                    # proceed with the previous trp & threshold
                    if (new_th - old_th) < 0.0001:
                        trp, th = old_trp, old_th

                # Consider the new nominal point to construct a new uncertainty set. This block
                # executes if there is no previous nominal_threshold entry or the new nominal
                # point lies outside of the existing L1 ball.
                if trp is None:
                    #print(i,"trp is None")
                    is_mdp_unchanged = False
                    nominal_points[(s, a)].append(res[2])

                    # Find the center of the L1 ball for the nominal points obtained with
                    # different value functions
                    trp, th = find_nominal_point(np.asarray(nominal_points[(s, a)]))
                    nominal_threshold[(s, a)] = (trp, th)

                threshold[0].append(s)
                threshold[1].append(a)
                threshold[2].append(th)

                for next_st in range(num_total_states):
                    reward = state_action_reward[(index_to_state[s], index_to_action[a])]
                    rmdp.add_transition(s, a, next_st, trp[int(next_st)], reward)

        rsol = rmdp.rsolve_mpi(b"robust_l1", threshold)

        violation = 0

        # If the whole MDP is unchanged, meaning the new value function didn't change the
        # uncertainty set for any state-action, there is no need to iterate further.
        if is_mdp_unchanged or i == num_update - 1:
            print("**** Add Values *****")
            print("MDP remains unchanged after number of iteration:", i)
            print("Policy", rsol.policy, "threshold", threshold)

            rpolicy = rsol.policy
            ret = est_true_mdp.solve_mpi(policy=rpolicy)
            under_estimate = np.dot(initial, ret.valuefunction) - np.dot(initial, rsol.valuefunction)

            #ropt_sol = rmdp.solve_mpi(policy=orig_sol.policy)
            real_regret = np.dot(initial, orig_sol.valuefunction) -\
                          np.dot(initial, ret.valuefunction)

            violation = 1 if (np.dot(initial, ret.valuefunction) -
                              np.dot(initial, rsol.valuefunction)) < 0 else 0
            break

        valuefunction = rsol.valuefunction
        valuefunctions.append(valuefunction)
        X.append(i)
        Y.append(valuefunction[0])

    return under_estimate, real_regret, violation