def FeichterPolicy(mdp, start_state=0, epsilon=1, randomseed=None, delta=0.1):
    global c
    if randomseed is not None:
        np.random.seed(randomseed)

    ##### Initialisation
    print(mdp.Vmax, 6 / epsilon, mdp.discountFactor)
    H = int((math.log(mdp.Vmax) + math.log(6.0 / epsilon)) / (1 - mdp.discountFactor))
    print("Chosen value of H is : ", H)
    N_h_s_a = np.zeros((H, mdp.numStates, mdp.numActions))
    N_h_s_a_s_prime = np.zeros((H, mdp.numStates, mdp.numActions, mdp.numStates), dtype=int)
    rewards_s_a_sprime = np.zeros((mdp.numStates, mdp.numActions, mdp.numStates))
    R_s_a = np.zeros((mdp.numStates, mdp.numActions))
    P_h_s_a_s_prime = np.zeros((H, mdp.numStates, mdp.numActions, mdp.numStates))
    P_tilda = np.zeros((mdp.numStates, mdp.numActions, mdp.numStates))
    P_lower_tilda = np.zeros((mdp.numStates, mdp.numActions, mdp.numStates))
    policy_h_s = np.zeros((H, mdp.numStates), dtype=int)
    d_h_policy_s = np.zeros((H + 1, mdp.numStates))
    dmax = 12 * mdp.Vmax / (epsilon * (1 - mdp.discountFactor))
    converge_iterations = 10000
    epsilon_convergence = 1e-4
    Qlower = np.zeros((mdp.numStates, mdp.numActions))
    QlowerMBAE = np.zeros((mdp.numStates, mdp.numActions))
    QupperMBAE = mdp.Vmax * np.ones((mdp.numStates, mdp.numActions))
    Qupper = mdp.Vmax * np.random.random([mdp.numStates, mdp.numActions])
    QstarMBAE = (mdp.Vmax / 2) * np.ones((mdp.numStates, mdp.numActions))
    Qstar = (mdp.Vmax / 2) * np.ones((mdp.numStates, mdp.numActions))
    VupperMBAE = mdp.Vmax * np.ones(mdp.numStates)
    Vlower = np.zeros(mdp.numStates)
    VlowerMBAE = np.zeros(mdp.numStates)
    Vstar = (mdp.Vmax / 2) * np.ones(mdp.numStates)
    Vupper = mdp.Vmax * np.random.random([mdp.numStates])
    sampled_frequency_s_a = np.zeros((mdp.numStates, mdp.numActions))
    N_s_a_sprime = np.zeros((mdp.numStates, mdp.numActions, mdp.numStates))
    it = 0
    samples = 0
    initial_iterations = 1 * mdp.numStates * mdp.numActions

    ### Initial sampling for all state action pairs
    while it < initial_iterations:
        for state in range(mdp.numStates):
            for act in range(mdp.numActions):
                it += 1
                s_prime, r = mdp.simulate(state, act)
                rewards_s_a_sprime[state][act][s_prime] += r
                R_s_a[state][act] = (r + R_s_a[state][act] * sampled_frequency_s_a[state][act]) / (sampled_frequency_s_a[state][act] + 1)
                sampled_frequency_s_a[state][act] += 1
                N_s_a_sprime[state][act][s_prime] += 1

    #### For starting the while loop below
    iteration = 1
    if verbose == 0:
        outp = open(mdp.filename + '-fiechter' + str(randomseed) + '.txt', 'w')
        ff = open(mdp.filename + '-fiechter-samples.txt', 'w+')

    #### Exploration
    acList = bestTwoActions(mdp, start_state, Qlower, Qupper, Qstar)
    coll = Qupper[start_state][acList[1]] - Qlower[start_state][acList[0]] - epsilon * (1 - mdp.discountFactor) / 2
    while coll > 0 or iteration < 50:
        h = 0
        current_state = start_state
        while h < H:
            current_action = policy_h_s[h][current_state]
            s_prime, r = mdp.simulate(current_state, current_action)
            N_h_s_a[h][current_state][current_action] += 1
            rewards_s_a_sprime[current_state][current_action][s_prime] += r
            R_s_a[current_state][current_action] = (r + R_s_a[current_state][current_action] * sampled_frequency_s_a[current_state][current_action]) / (sampled_frequency_s_a[current_state][current_action] + 1)
            N_h_s_a_s_prime[h][current_state][current_action][s_prime] += 1
            N_s_a_sprime[current_state][current_action][s_prime] += 1
            sampled_frequency_s_a[current_state][current_action] += 1
            for s2 in range(mdp.numStates):
                P_h_s_a_s_prime[h][current_state][current_action][s2] = N_h_s_a_s_prime[h][current_state][current_action][s2] / N_h_s_a[h][current_state][current_action]
            h += 1
            current_state = s_prime
            samples += 1
            if samples % 100 == 0:
                acList = bestTwoActions(mdp, start_state, QlowerMBAE, QupperMBAE, QstarMBAE)
                if verbose == 0:
                    outp.write(str(samples))
                    outp.write('\t')
                    outp.write(str(QupperMBAE[start_state][acList[1]] - QlowerMBAE[start_state][acList[0]]))
                    outp.write('\n')
                else:
                    print(Qupper[start_state], Qlower[start_state])
                np.savetxt(ff, sampled_frequency_s_a, delimiter=',')
                ff.write('\n')

        # Compute the new exploration policy by dynamic programming over the horizon
        e_s_a = np.zeros((mdp.numStates, mdp.numActions))
        for h in range(H - 1, -1, -1):
            for state in range(mdp.numStates):
                for act in range(mdp.numActions):
                    if N_h_s_a[h][state][act] == 0:
                        e_s_a[state][act] = dmax
                    else:
                        sqterm = (2 * math.log(4 * H * mdp.numStates * mdp.numActions) - 2 * math.log(delta)) / N_h_s_a[h][state][act]
                        summation = np.sum((N_h_s_a_s_prime[h][state][act] / N_h_s_a[h][state][act]) * d_h_policy_s[h + 1])
                        secondterm = mdp.discountFactor * summation
                        e_s_a[state][act] = min(dmax, 6 * mdp.Vmax * math.sqrt(sqterm) / (epsilon * (1 - delta)) + secondterm)
                policy_h_s[h][state] = np.argmax(e_s_a[state])
                d_h_policy_s[h][state] = np.amax(e_s_a[state])

        # Compute MBAE QupperMBAE and QlowerMBAE bounds
        for internal in range(converge_iterations):
            oldQlowerMBAE = np.copy(QlowerMBAE[start_state])
            for state in range(mdp.numStates):
                for act in range(mdp.numActions):
                    firstterm = np.sum(rewards_s_a_sprime[state][act]) / sampled_frequency_s_a[state][act]
                    secondterm = mdp.discountFactor * np.sum(VupperMBAE * (N_s_a_sprime[state][act] / sampled_frequency_s_a[state][act]))
                    lower_secondterm = mdp.discountFactor * np.sum(VlowerMBAE * (N_s_a_sprime[state][act] / sampled_frequency_s_a[state][act]))
                    star_secondterm = mdp.discountFactor * np.sum(Vstar * (N_s_a_sprime[state][act] / sampled_frequency_s_a[state][act]))
                    thirdterm = mdp.Vmax * math.sqrt((math.log(c * (samples**2) * mdp.numStates * mdp.numActions) - math.log(delta)) / sampled_frequency_s_a[state][act])
                    QupperMBAE[state][act] = firstterm + secondterm + thirdterm
                    QlowerMBAE[state][act] = firstterm + lower_secondterm - thirdterm
                    QstarMBAE[state][act] = firstterm + star_secondterm
                VupperMBAE[state] = np.amax(QupperMBAE[state])
                VlowerMBAE[state] = np.amax(QlowerMBAE[state])
                Vstar[state] = np.amax(QstarMBAE[state])
            if np.linalg.norm(oldQlowerMBAE - QlowerMBAE[start_state]) <= epsilon_convergence:
                break

        for i in range(mdp.numStates):
            for j in range(mdp.numActions):
                if sampled_frequency_s_a[i][j] > 0:
                    P_tilda[i][j] = UpperP(i, j, delta, N_s_a_sprime[i][j], mdp.numStates, Vupper, False)
                    P_lower_tilda[i][j] = LowerP(i, j, delta, N_s_a_sprime[i][j], mdp.numStates, Vlower, False)
        Qupper, Vupper = iteratedConvergence(Qupper, R_s_a, P_tilda, mdp.discountFactor, epsilon, converge_iterations, epsilon_convergence)
        Qlower, Vlower = iteratedConvergence(Qlower, R_s_a, P_lower_tilda, mdp.discountFactor, epsilon, converge_iterations, epsilon_convergence)
        iteration += 1
        acList = bestTwoActions(mdp, start_state, QlowerMBAE, QupperMBAE, QstarMBAE)
        coll = QupperMBAE[start_state][acList[1]] - QlowerMBAE[start_state][acList[0]] - epsilon * (1 - mdp.discountFactor) / 2

    print(iteration)
    a = open('final' + mdp.filename + '-fiechter.txt', 'a+')
    a.write(str(iteration) + '\n')
    a.close()
    return getBestPolicy(mdp, rewards_s_a_sprime, P_h_s_a_s_prime[0])
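# The solvers in this file call a helper `iteratedConvergence(Q, R, P, gamma, epsilon,
# converge_iterations, epsilon_convergence)` that is defined elsewhere in the module and
# returns updated (Q, V). The sketch below is only an assumption of what that helper
# computes: plain Q-value iteration under the supplied reward estimate R_s_a and
# transition model P, stopped after `converge_iterations` sweeps or once the value
# function moves by less than `epsilon_convergence`. The `epsilon` argument is unused
# here and kept only to mirror the call sites.
def _iterated_convergence_sketch(Q, R_s_a, P, discountFactor, epsilon, converge_iterations, epsilon_convergence):
    V = np.max(Q, axis=1)
    for _ in range(converge_iterations):
        # Bellman backup: Q(s,a) = R(s,a) + gamma * sum_s' P(s,a,s') V(s')
        Q_new = R_s_a + discountFactor * np.einsum('sat,t->sa', P, V)
        V_new = np.max(Q_new, axis=1)
        if np.max(np.abs(V_new - V)) <= epsilon_convergence:
            return Q_new, V_new
        Q, V = Q_new, V_new
    return Q, V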
def LUCBEpisodic(mdp, start_state=0, epsilon=4, randomseed=None, delta=0.1, fileprint=1):
    if randomseed is not None:
        np.random.seed(randomseed)
    global MAX_ITERATION_LIMIT, c
    iteration = 0
    it = 0
    H = int((math.log(mdp.Vmax) + math.log(6.0 / epsilon)) / (1 - mdp.discountFactor))
    initial_iterations = 1 * mdp.numStates * mdp.numActions
    rewards_s_a_sprime = np.zeros((mdp.numStates, mdp.numActions, mdp.numStates))
    R_s_a = np.zeros((mdp.numStates, mdp.numActions))
    sampled_frequency_s_a = np.zeros((mdp.numStates, mdp.numActions))
    N_s_a_sprime = np.zeros((mdp.numStates, mdp.numActions, mdp.numStates))
    P = np.zeros((mdp.numStates, mdp.numActions, mdp.numStates))
    P_tilda = np.zeros((mdp.numStates, mdp.numActions, mdp.numStates))
    P_lower_tilda = np.zeros((mdp.numStates, mdp.numActions, mdp.numStates))
    VlowerMBAE = np.zeros(mdp.numStates)
    Vlower = np.zeros(mdp.numStates)
    Vstar = (mdp.Vmax / 2) * np.ones(mdp.numStates)
    Vupper = mdp.Vmax * np.random.random([mdp.numStates])
    Qlower = np.zeros((mdp.numStates, mdp.numActions))
    VupperMBAE = mdp.Vmax * np.ones(mdp.numStates)
    QlowerMBAE = np.zeros((mdp.numStates, mdp.numActions))
    Qstar = (mdp.Vmax / 2) * np.ones((mdp.numStates, mdp.numActions))
    QupperMBAE = mdp.Vmax * np.ones((mdp.numStates, mdp.numActions))
    Qupper = mdp.Vmax * np.random.random([mdp.numStates, mdp.numActions])
    final_policy = (-1) * np.ones(mdp.numStates, dtype=int)
    states_to_sample = list(range(mdp.numStates))
    colliding_values = np.zeros(mdp.numStates)
    is_converged = 0
    print("Vmax", mdp.Vmax)
    print("Epsilon is ", epsilon)

    ### Initial sampling for all state action pairs
    while it < initial_iterations:
        for state in range(mdp.numStates):
            for act in range(mdp.numActions):
                it += 1
                s_prime, r = mdp.simulate(state, act)
                rewards_s_a_sprime[state][act][s_prime] += r
                R_s_a[state][act] = (r + R_s_a[state][act] * sampled_frequency_s_a[state][act]) / (sampled_frequency_s_a[state][act] + 1)
                sampled_frequency_s_a[state][act] += 1
                N_s_a_sprime[state][act][s_prime] += 1
                for s2 in range(mdp.numStates):
                    P[state][act][s2] = N_s_a_sprime[state][act][s2] / sampled_frequency_s_a[state][act]

    ### Calculating V, Q estimates (MBAE) from the initial samples
    for internal in range(converge_iterations):
        oldQlowerMBAE = np.copy(QlowerMBAE[start_state])
        for state in range(mdp.numStates):
            for act in range(mdp.numActions):
                # Calculations for QupperMBAE and QlowerMBAE
                firstterm = np.sum(rewards_s_a_sprime[state][act]) / sampled_frequency_s_a[state][act]
                secondterm = mdp.discountFactor * np.sum(VupperMBAE * (N_s_a_sprime[state][act] / sampled_frequency_s_a[state][act]))
                lower_secondterm = mdp.discountFactor * np.sum(VlowerMBAE * (N_s_a_sprime[state][act] / sampled_frequency_s_a[state][act]))
                star_secondterm = mdp.discountFactor * np.sum(Vstar * (N_s_a_sprime[state][act] / sampled_frequency_s_a[state][act]))
                thirdterm = mdp.Vmax * math.sqrt((math.log(c * mdp.numStates * mdp.numActions) - math.log(delta)) / sampled_frequency_s_a[state][act])
                QupperMBAE[state][act] = firstterm + secondterm + thirdterm
                QlowerMBAE[state][act] = firstterm + lower_secondterm - thirdterm
                Qstar[state][act] = firstterm + star_secondterm
            VupperMBAE[state] = np.amax(QupperMBAE[state])
            VlowerMBAE[state] = np.amax(QlowerMBAE[state])
            Vstar[state] = np.amax(Qstar[state])
        if np.linalg.norm(oldQlowerMBAE - QlowerMBAE[start_state]) <= epsilon_convergence:
            print("Stopping with ", internal, "initial internal iterations")
            break
    if internal == converge_iterations - 1:
        print("Used all iterations")
    print("Initial estimate of QupperMBAE found! Now sampling")

    Qupper = np.copy(QupperMBAE)
    Qlower = np.copy(QlowerMBAE)
    if verbose == 0:
        outp = open(mdp.filename + '-lucbeps' + str(randomseed) + '.txt', 'w')
        ff = open(mdp.filename + '-lucbeps-samples.txt', 'w+')
    h = 0
    state1 = start_state
    iteration += initial_iterations
    while iteration < MAX_ITERATION_LIMIT:
        max_collision_state = [sorted(states_to_sample, key=lambda x: colliding_values[x], reverse=True)[0]]
        if h % H == 0:
            state1 = start_state
            h = 0
        else:
            state1 = nextstate
        actionsList = bestTwoActions(mdp, state1, QlowerMBAE, QupperMBAE, Qstar)
        a = np.random.choice(actionsList)
        iteration += 1
        for t in range(1):
            s_prime, r = mdp.simulate(state1, a)
            nextstate = s_prime
            rewards_s_a_sprime[state1][a][s_prime] += r
            R_s_a[state1][a] = (r + R_s_a[state1][a] * sampled_frequency_s_a[state1][a]) / (sampled_frequency_s_a[state1][a] + 1)
            sampled_frequency_s_a[state1][a] += 1
            N_s_a_sprime[state1][a][s_prime] += 1
            for s2 in range(mdp.numStates):
                P[state1][a][s2] = N_s_a_sprime[state1][a][s2] / sampled_frequency_s_a[state1][a]

        ## Calculating Q and V values
        for i in range(mdp.numStates):
            for j in range(mdp.numActions):
                if sampled_frequency_s_a[i][j] > 0:
                    P_tilda[i][j] = UpperP(i, j, delta, N_s_a_sprime[i][j], mdp.numStates, Vupper, False)
                    P_lower_tilda[i][j] = LowerP(i, j, delta, N_s_a_sprime[i][j], mdp.numStates, Vlower, False)
        Qupper, Vupper = iteratedConvergence(Qupper, R_s_a, P_tilda, mdp.discountFactor, epsilon, converge_iterations, epsilon_convergence)
        Qlower, Vlower = iteratedConvergence(Qlower, R_s_a, P_lower_tilda, mdp.discountFactor, epsilon, converge_iterations, epsilon_convergence)
        if verbose == 1:
            print(QupperMBAE[start_state], Qstar[start_state], QlowerMBAE[start_state])

        # Calculations for QupperMBAE and QlowerMBAE (iterated to convergence)
        for internal in range(converge_iterations):
            oldQlowerMBAE = np.copy(QlowerMBAE[start_state])
            for state in range(mdp.numStates):
                for act in range(mdp.numActions):
                    firstterm = np.sum(rewards_s_a_sprime[state][act]) / sampled_frequency_s_a[state][act]
                    secondterm = mdp.discountFactor * np.sum(VupperMBAE * (N_s_a_sprime[state][act] / sampled_frequency_s_a[state][act]))
                    lower_secondterm = mdp.discountFactor * np.sum(VlowerMBAE * (N_s_a_sprime[state][act] / sampled_frequency_s_a[state][act]))
                    star_secondterm = mdp.discountFactor * np.sum(Vstar * (N_s_a_sprime[state][act] / sampled_frequency_s_a[state][act]))
                    thirdterm = mdp.Vmax * math.sqrt((math.log(c * (iteration**2) * mdp.numStates * mdp.numActions) - math.log(delta)) / sampled_frequency_s_a[state][act])
                    QupperMBAE[state][act] = firstterm + secondterm + thirdterm
                    QlowerMBAE[state][act] = firstterm + lower_secondterm - thirdterm
                    Qstar[state][act] = firstterm + star_secondterm
                VupperMBAE[state] = np.amax(QupperMBAE[state])
                VlowerMBAE[state] = np.amax(QlowerMBAE[state])
                Vstar[state] = np.amax(Qstar[state])
            if np.linalg.norm(oldQlowerMBAE - QlowerMBAE[start_state]) <= epsilon_convergence:
                break

        count = 0
        if iteration % 100 == 0:
            acList = bestTwoActions(mdp, start_state, Qlower, Qupper, Qstar)
            if verbose == 0:
                outp.write(str(iteration))
                outp.write('\t')
                outp.write(str(QupperMBAE[start_state][acList[1]] - QlowerMBAE[start_state][acList[0]]))
                outp.write('\n')
            else:
                print(iteration, QupperMBAE[start_state][acList[1]] - QlowerMBAE[start_state][acList[0]])
            np.savetxt(ff, sampled_frequency_s_a, delimiter=',')
            ff.write('\n')

        ##### Updating the list of colliding states
        if iteration > 50:
            states_to_sample = []
            for st in range(mdp.numStates):
                acList = bestTwoActions(mdp, st, QlowerMBAE, QupperMBAE, Qstar)
                ##### Stopping condition uses epsilon*(1-gamma)/2
                colliding_values[st] = QupperMBAE[st][acList[1]] - QlowerMBAE[st][acList[0]] - epsilon * (1 - mdp.discountFactor) / 2
                if colliding_values[st] > 0:
                    ### this state is still colliding, keep sampling it
                    states_to_sample.append(st)
        else:
            colliding_values = list(range(mdp.numStates))
            states_to_sample = list(range(mdp.numStates))

        #### Check epsilon condition for the starting state only
        if (start_state not in states_to_sample) and iteration > 50:
            acList = bestTwoActions(mdp, start_state, QlowerMBAE, QupperMBAE, Qstar)
            print("Difference is ", Qupper[start_state][acList[1]] - Qlower[start_state][acList[0]])
            print("Setting final_policy of ", start_state, " to", acList[0])
            final_policy[start_state] = acList[0]
            print("Iterations taken : ", iteration)
            for i in range(mdp.numStates):
                if final_policy[i] == -1:
                    final_policy[i] = bestTwoActions(mdp, i, QlowerMBAE, QupperMBAE, Qstar)[0]
            print("Returning policy : ", final_policy)
            if iteration != 51:
                a = open('final' + mdp.filename + '-lucbeps.txt', 'a+')
                a.write(str(iteration) + '\n')
                a.close()
            return final_policy
        h += 1

    outp.close()
    ff.close()
    for i in range(mdp.numStates):
        if final_policy[i] == -1:
            final_policy[i] = bestTwoActions(mdp, i, QlowerMBAE, QupperMBAE, Qstar)[0]
    return final_policy
def mbie(mdp, start_state=0, epsilon=4, randomseed=None, delta=0.1):
    global c
    if randomseed is not None:
        np.random.seed(randomseed)
    initial_iterations = 1 * mdp.numStates * mdp.numActions
    ### Estimate the horizon as in Fiechter
    H = int((math.log(mdp.Vmax) + math.log(6.0 / epsilon)) / (1 - mdp.discountFactor))
    it = 0
    samples = 0
    ### Calculating m (the per state-action sample cap) from the MBIE parameters
    first_term = mdp.numStates / (epsilon**2 * (1 - mdp.discountFactor)**4)
    second_term = math.log(mdp.numStates * mdp.numActions / (epsilon * (1 - mdp.discountFactor) * delta)) / (epsilon**2 * (1 - mdp.discountFactor)**4)
    m = c * (first_term + second_term)
    print("Chosen values of H and m are :", H, m)
    N_s_a = np.zeros((mdp.numStates, mdp.numActions))
    N_s_a_sprime = np.zeros((mdp.numStates, mdp.numActions, mdp.numStates))
    P_s_a_sprime = np.zeros((mdp.numStates, mdp.numActions, mdp.numStates))
    P_tilda = np.zeros((mdp.numStates, mdp.numActions, mdp.numStates))
    P_lower_tilda = np.zeros((mdp.numStates, mdp.numActions, mdp.numStates))
    R_s_a = np.zeros((mdp.numStates, mdp.numActions))
    Qupper = mdp.Vmax * np.random.random([mdp.numStates, mdp.numActions])
    QupperMBAE = mdp.Vmax * np.ones((mdp.numStates, mdp.numActions))
    Qlower = np.zeros((mdp.numStates, mdp.numActions))
    QlowerMBAE = np.zeros((mdp.numStates, mdp.numActions))
    Qstar = (mdp.Vmax / 2) * np.ones((mdp.numStates, mdp.numActions))
    Vupper = mdp.Vmax * np.random.random([mdp.numStates])
    VupperMBAE = mdp.Vmax * np.ones(mdp.numStates)
    Vlower = np.zeros(mdp.numStates)
    VlowerMBAE = np.zeros(mdp.numStates)
    Vstar = (mdp.Vmax / 2) * np.ones(mdp.numStates)
    best_policy = (-1) * np.ones(mdp.numStates, dtype=int)

    ### Initial sampling for all state action pairs
    while it < initial_iterations:
        for state in range(mdp.numStates):
            for act in range(mdp.numActions):
                it += 1
                ss, rr = mdp.simulate(state, act)
                R_s_a[state][act] = rr
                N_s_a[state][act] += 1
                N_s_a_sprime[state][act][ss] += 1
                for s2 in range(mdp.numStates):
                    P_s_a_sprime[state][act][s2] = N_s_a_sprime[state][act][s2] / N_s_a[state][act]
    samples += initial_iterations
    print(P_s_a_sprime)
    print("Completed initial iterations")
    Qupper, Vupper = iteratedConvergence(Qupper, R_s_a, P_s_a_sprime, mdp.discountFactor, epsilon, converge_iterations, epsilon_convergence)
    print(Qupper, "Qupper")

    current_state = start_state
    if verbose == 0:
        outp = open(mdp.filename + '-mbie' + str(randomseed) + '.txt', 'w')
        ff = open(mdp.filename + '-mbie-samples.txt', 'w+')

    ### Repeat until the sample budget is exhausted
    while samples < MAX_ITERATION_LIMIT:
        current_state = start_state
        h = 1
        while h <= H:
            if samples % 100 == 0:
                acList = bestTwoActions(mdp, start_state, QlowerMBAE, QupperMBAE, Qstar)
                if verbose == 0:
                    outp.write(str(samples))
                    outp.write('\t')
                    outp.write(str(QupperMBAE[start_state][acList[1]] - QlowerMBAE[start_state][acList[0]]))
                    outp.write('\n')
                    print(samples, (QupperMBAE[start_state][acList[1]] - QlowerMBAE[start_state][acList[0]]))
                else:
                    print(samples, (QupperMBAE[start_state][acList[1]], QlowerMBAE[start_state][acList[0]]))
                np.savetxt(ff, N_s_a, delimiter=',')
                ff.write('\n')
            for i in range(mdp.numStates):
                for j in range(mdp.numActions):
                    P_tilda[i][j] = UpperP(i, j, delta, N_s_a_sprime[i][j], mdp.numStates, Vupper, False)
                    P_lower_tilda[i][j] = LowerP(i, j, delta, N_s_a_sprime[i][j], mdp.numStates, Vlower, False)
            Qupper, Vupper = iteratedConvergence(Qupper, R_s_a, P_tilda, mdp.discountFactor, epsilon, converge_iterations, epsilon_convergence)
            Qlower, Vlower = iteratedConvergence(Qlower, R_s_a, P_lower_tilda, mdp.discountFactor, epsilon, converge_iterations, epsilon_convergence)
            current_action = np.argmax(QupperMBAE[current_state])
            best_policy[current_state] = current_action
            if N_s_a[current_state][current_action] < m:
                for t in range(1):
                    ss, rr = mdp.simulate(current_state, current_action)
                    R_s_a[current_state][current_action] = (rr + R_s_a[current_state][current_action] * N_s_a[current_state][current_action]) / (N_s_a[current_state][current_action] + 1)
                    N_s_a[current_state][current_action] += 1
                    N_s_a_sprime[current_state][current_action][ss] += 1
                    samples += 1
                    for s2 in range(mdp.numStates):
                        P_s_a_sprime[current_state][current_action][s2] = N_s_a_sprime[current_state][current_action][s2] / N_s_a[current_state][current_action]
                    current_state = ss
            else:
                # This pair already has m samples; follow the believed model instead of sampling
                print("State-action pair already sampled m times")
                print(N_s_a[current_state])
                current_state = np.random.choice(np.arange(mdp.numStates), p=P_s_a_sprime[current_state][current_action] / np.sum(P_s_a_sprime[current_state][current_action]))
            h += 1

        # Compute MBAE QupperMBAE and QlowerMBAE bounds
        for internal in range(converge_iterations):
            oldQlower = np.copy(QlowerMBAE[start_state])
            for state in range(mdp.numStates):
                for act in range(mdp.numActions):
                    firstterm = R_s_a[state][act]
                    secondterm = mdp.discountFactor * np.sum(VupperMBAE * (N_s_a_sprime[state][act] / N_s_a[state][act]))
                    lower_secondterm = mdp.discountFactor * np.sum(VlowerMBAE * (N_s_a_sprime[state][act] / N_s_a[state][act]))
                    star_secondterm = mdp.discountFactor * np.sum(Vstar * (N_s_a_sprime[state][act] / N_s_a[state][act]))
                    thirdterm = mdp.Vmax * math.sqrt((math.log(c * (samples**2) * mdp.numStates * mdp.numActions) - math.log(delta)) / N_s_a[state][act])
                    QupperMBAE[state][act] = firstterm + secondterm + thirdterm
                    QlowerMBAE[state][act] = firstterm + lower_secondterm - thirdterm
                    Qstar[state][act] = firstterm + star_secondterm
                VupperMBAE[state] = np.amax(QupperMBAE[state])
                VlowerMBAE[state] = np.amax(QlowerMBAE[state])
                Vstar[state] = np.amax(Qstar[state])
            if np.linalg.norm(oldQlower - QlowerMBAE[start_state]) <= epsilon_convergence:
                break
    return best_policy
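# All of the solvers in this file touch the MDP object only through numStates,
# numActions, Vmax, discountFactor, filename and simulate(state, action), which
# returns (next_state, reward). The class below is NOT the project's MDP class;
# it is a minimal hypothetical stub with the same interface, useful only for
# smoke-testing the solvers on a tiny random MDP.
class _RandomMDPStub(object):
    def __init__(self, numStates=5, numActions=2, discountFactor=0.9, seed=0):
        rng = np.random.RandomState(seed)
        self.numStates = numStates
        self.numActions = numActions
        self.discountFactor = discountFactor
        self.filename = 'random-stub'
        # Random transition kernel (rows sum to 1) and rewards in [0, 1]
        self._T = rng.dirichlet(np.ones(numStates), size=(numStates, numActions))
        self._R = rng.random_sample((numStates, numActions))
        self.Vmax = 1.0 / (1 - discountFactor)

    def simulate(self, state, action):
        next_state = np.random.choice(self.numStates, p=self._T[state][action])
        return next_state, self._R[state][action]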
def ddvouu(mdp, start_state=0, epsilon=4, randomseed=None, delta=0.1):
    if randomseed is not None:
        np.random.seed(randomseed)
    initial_iterations = 1 * mdp.numStates * mdp.numActions
    c = 1
    it = 0
    samples = 0
    ### Calculating m (the per state-action sample cap) and the per-estimate delta
    first_term = mdp.numStates / (epsilon**2 * (1 - mdp.discountFactor)**4)
    second_term = math.log(mdp.numStates * mdp.numActions / (epsilon * (1 - mdp.discountFactor) * delta)) / (epsilon**2 * (1 - mdp.discountFactor)**4)
    m = c * (first_term + second_term)
    delta = delta / (mdp.numStates * mdp.numActions * m)
    print("Chosen value of m is :", m)
    N_s_a = np.zeros((mdp.numStates, mdp.numActions), dtype=int)
    N_s_a_sprime = np.zeros((mdp.numStates, mdp.numActions, mdp.numStates), dtype=int)
    P_s_a_sprime = np.zeros((mdp.numStates, mdp.numActions, mdp.numStates))
    P_tilda = np.zeros((mdp.numStates, mdp.numActions, mdp.numStates))
    P_lower_tilda = np.zeros((mdp.numStates, mdp.numActions, mdp.numStates))
    R_s_a = np.zeros((mdp.numStates, mdp.numActions))
    Qupper = mdp.Vmax * np.ones((mdp.numStates, mdp.numActions))
    QupperMBAE = mdp.Vmax * np.ones((mdp.numStates, mdp.numActions))
    Qlower = np.zeros((mdp.numStates, mdp.numActions))
    Qstar = (mdp.Vmax / 2) * np.ones((mdp.numStates, mdp.numActions))
    QlowerMBAE = np.zeros((mdp.numStates, mdp.numActions))
    Vupper = mdp.Vmax * np.ones(mdp.numStates)
    VupperMBAE = mdp.Vmax * np.ones(mdp.numStates)
    Vlower = np.zeros(mdp.numStates)
    VlowerMBAE = np.zeros(mdp.numStates)
    Vstar = (mdp.Vmax / 2) * np.ones(mdp.numStates)
    best_policy = (-1) * np.ones(mdp.numStates, dtype=int)
    deltadeltaV = np.zeros((mdp.numStates, mdp.numActions))
    # NOTE: hard-coded initial set of discovered states
    discovered_states = set([start_state, 1, 2, 3, 4])

    ## Initial sampling for all state action pairs
    while it < initial_iterations:
        for state in range(mdp.numStates):
            for act in range(mdp.numActions):
                it += 1
                ss, rr = mdp.simulate(state, act)
                print("Sampling ", state, act, rr, ss)
                R_s_a[state][act] = (rr + R_s_a[state][act] * N_s_a[state][act]) / (N_s_a[state][act] + 1)
                N_s_a[state][act] += 1
                N_s_a_sprime[state][act][ss] += 1
                for s2 in range(mdp.numStates):
                    P_s_a_sprime[state][act][s2] = N_s_a_sprime[state][act][s2] / N_s_a[state][act]
    samples += initial_iterations
    print(P_s_a_sprime)
    print("Completed initial iterations")
    if verbose == 0:
        outp = open(mdp.filename + '-ddv' + str(randomseed) + '.txt', 'w')
        ff = open(mdp.filename + '-ddv-samples.txt', 'w+')

    current_state = start_state
    ### Repeat until the sample budget is exhausted
    while samples < MAX_ITERATION_LIMIT:
        for i in range(mdp.numStates):
            for j in range(mdp.numActions):
                if N_s_a[i][j] > 0:
                    P_tilda[i][j] = UpperP(i, j, delta, N_s_a_sprime[i][j], mdp.numStates, Vupper, False)
                    P_lower_tilda[i][j] = LowerP(i, j, delta, N_s_a_sprime[i][j], mdp.numStates, Vlower, False)
        ## Calculate Q values
        Qupper, Vupper = iteratedConvergence(Qupper, R_s_a, P_tilda, mdp.discountFactor, epsilon, converge_iterations, epsilon_convergence)
        Qlower, Vlower = iteratedConvergence(Qlower, R_s_a, P_lower_tilda, mdp.discountFactor, epsilon, converge_iterations, epsilon_convergence)
        current_state = start_state

        ### Terminating condition
        if use_mbae:
            acList = bestTwoActions(mdp, start_state, QlowerMBAE, QupperMBAE, Qstar)
            coll = QupperMBAE[start_state][acList[1]] - QlowerMBAE[start_state][acList[0]] - epsilon * (1 - mdp.discountFactor) / 2
        else:
            acList = bestTwoActions(mdp, start_state, Qlower, Qupper, Qstar)
            coll = Qupper[start_state][acList[1]] - Qlower[start_state][acList[0]] - epsilon * (1 - mdp.discountFactor) / 2
        if coll < 0 and samples > 50:
            a = open('final' + mdp.filename + '-ddv.txt', 'a+')
            a.write(str(samples) + '\n')
            a.close()
            print(Qupper[start_state], Vupper[start_state], Vlower[start_state])
            policy_lower = np.argmax(Qlower, axis=1)
            print("Iteration number ", samples)
            print("Returning policy because of epsilon-convergence")
            print(policy_lower)
            print(np.argmax(QupperMBAE, axis=1))
            print(np.argmax(Qupper, axis=1))
            print(np.argmax(QlowerMBAE, axis=1))
            print(np.argmax(Qstar, axis=1))
            return policy_lower

        ## Calculate deldelV for all discovered states
        if use_mbae:
            for st in list(discovered_states):
                for ac in range(mdp.numActions):
                    deltadeltaV[st][ac] = CalculateDelDelV(st, ac, mdp, N_s_a_sprime, QupperMBAE, QlowerMBAE, VupperMBAE, VlowerMBAE, start_state, P_s_a_sprime, P_tilda, P_lower_tilda, R_s_a, epsilon, delta, converge_iterations, epsilon_convergence)
        else:
            for st in list(discovered_states):
                for ac in range(mdp.numActions):
                    deltadeltaV[st][ac] = CalculateDelDelV(st, ac, mdp, N_s_a_sprime, Qupper, Qlower, Vupper, Vlower, start_state, P_s_a_sprime, P_tilda, P_lower_tilda, R_s_a, epsilon, delta, converge_iterations, epsilon_convergence)

        #### Sample greedily with respect to deldelV
        current_state, current_action = np.unravel_index(deltadeltaV.argmax(), deltadeltaV.shape)
        print(deltadeltaV)
        ss, rr = mdp.simulate(current_state, current_action)
        samples += 1
        print("Sampling ", current_state, current_action, rr, ss)
        #### Add the reached state to the set of discovered states (currently disabled)
        # discovered_states.add(ss)
        print(discovered_states)

        ### Update believed model
        R_s_a[current_state][current_action] = (rr + R_s_a[current_state][current_action] * N_s_a[current_state][current_action]) / (N_s_a[current_state][current_action] + 1)
        N_s_a[current_state][current_action] += 1
        N_s_a_sprime[current_state][current_action][ss] += 1
        for s2 in range(mdp.numStates):
            P_s_a_sprime[current_state][current_action][s2] = N_s_a_sprime[current_state][current_action][s2] / N_s_a[current_state][current_action]

        if samples % 100 == 0:
            if use_mbae:
                acList = bestTwoActions(mdp, start_state, QlowerMBAE, QupperMBAE, Qstar)
            else:
                acList = bestTwoActions(mdp, start_state, Qlower, Qupper, Qstar)
            if verbose == 0:
                outp.write(str(samples))
                outp.write('\t')
                if plot_vstar:
                    outp.write(str(Vstar[start_state]))
                else:
                    if use_mbae:
                        outp.write(str(QupperMBAE[start_state][acList[1]] - QlowerMBAE[start_state][acList[0]]))
                    else:
                        outp.write(str(Qupper[start_state][acList[1]] - Qlower[start_state][acList[0]]))
                outp.write('\n')
                if use_mbae:
                    print(samples, (QupperMBAE[start_state][acList[1]] - QlowerMBAE[start_state][acList[0]]))
                else:
                    print(samples, (Qupper[start_state][acList[1]] - Qlower[start_state][acList[0]]))
            else:
                print(samples, (QupperMBAE[start_state][acList[1]] - QlowerMBAE[start_state][acList[0]]))
            np.savetxt(ff, N_s_a, delimiter=',')
            ff.write('\n')

        ### Calculating MBAE bounds
        for internal in range(converge_iterations):
            oldQlower = np.copy(QlowerMBAE[start_state])
            for state in range(mdp.numStates):
                for act in range(mdp.numActions):
                    firstterm = R_s_a[state][act]
                    secondterm = mdp.discountFactor * np.sum(VupperMBAE * (N_s_a_sprime[state][act] / N_s_a[state][act]))
                    lower_secondterm = mdp.discountFactor * np.sum(VlowerMBAE * (N_s_a_sprime[state][act] / N_s_a[state][act]))
                    star_secondterm = mdp.discountFactor * np.sum(Vstar * (N_s_a_sprime[state][act] / N_s_a[state][act]))
                    thirdterm = mdp.Vmax * math.sqrt((math.log(c * (samples**2) * mdp.numStates * mdp.numActions) - math.log(delta)) / N_s_a[state][act])
                    QupperMBAE[state][act] = firstterm + secondterm + thirdterm
                    QlowerMBAE[state][act] = firstterm + lower_secondterm - thirdterm
                    Qstar[state][act] = firstterm + star_secondterm
                VupperMBAE[state] = np.amax(QupperMBAE[state])
                VlowerMBAE[state] = np.amax(QlowerMBAE[state])
                Vstar[state] = np.amax(Qstar[state])
            if np.linalg.norm(oldQlower - QlowerMBAE[start_state]) <= epsilon_convergence:
                break
    return best_policy
def RoundRobin(mdp, start_state=0, epsilon=4, randomseed=None, delta=0.1):
    global MAX_ITERATION_LIMIT, c
    if randomseed is not None:
        np.random.seed(randomseed)
    iteration = 0
    it = 0
    initial_iterations = 1 * mdp.numStates * mdp.numActions
    rewards_s_a_sprime = np.zeros((mdp.numStates, mdp.numActions, mdp.numStates))
    R_s_a = np.zeros((mdp.numStates, mdp.numActions))
    sampled_frequency_s_a = np.zeros((mdp.numStates, mdp.numActions))
    N_s_a_sprime = np.zeros((mdp.numStates, mdp.numActions, mdp.numStates))
    P = np.zeros((mdp.numStates, mdp.numActions, mdp.numStates))
    P_tilda = np.zeros((mdp.numStates, mdp.numActions, mdp.numStates))
    P_lower_tilda = np.zeros((mdp.numStates, mdp.numActions, mdp.numStates))
    VlowerMBAE = np.zeros(mdp.numStates)
    Vlower = np.zeros(mdp.numStates)
    Vstar = (mdp.Vmax / 2) * np.ones(mdp.numStates)
    VupperMBAE = mdp.Vmax * np.ones(mdp.numStates)
    Vupper = mdp.Vmax * np.random.random([mdp.numStates])
    QlowerMBAE = np.zeros((mdp.numStates, mdp.numActions))
    Qlower = np.zeros((mdp.numStates, mdp.numActions))
    Qstar = (mdp.Vmax / 2) * np.ones((mdp.numStates, mdp.numActions))
    QupperMBAE = mdp.Vmax * np.ones((mdp.numStates, mdp.numActions))
    Qupper = mdp.Vmax * np.random.random([mdp.numStates, mdp.numActions])
    final_policy = (-1) * np.ones(mdp.numStates, dtype=int)
    states_to_sample = list(range(mdp.numStates))
    colliding_values = np.zeros(mdp.numStates)
    is_converged = 0

    ### Initial sampling for all state action pairs
    while it < initial_iterations:
        for state in range(mdp.numStates):
            for act in range(mdp.numActions):
                it += 1
                s_prime, r = mdp.simulate(state, act)
                rewards_s_a_sprime[state][act][s_prime] += r
                R_s_a[state][act] = (r + R_s_a[state][act] * sampled_frequency_s_a[state][act]) / (sampled_frequency_s_a[state][act] + 1)
                sampled_frequency_s_a[state][act] += 1
                N_s_a_sprime[state][act][s_prime] += 1
                for s2 in range(mdp.numStates):
                    P[state][act][s2] = N_s_a_sprime[state][act][s2] / sampled_frequency_s_a[state][act]

    ### Calculating V, Q estimates from the initial samples
    for state in range(mdp.numStates):
        for act in range(mdp.numActions):
            # Calculations for QupperMBAE and QlowerMBAE
            firstterm = np.sum(rewards_s_a_sprime[state][act]) / sampled_frequency_s_a[state][act]
            secondterm = mdp.discountFactor * np.sum(VupperMBAE * (N_s_a_sprime[state][act] / sampled_frequency_s_a[state][act]))
            lower_secondterm = mdp.discountFactor * np.sum(VlowerMBAE * (N_s_a_sprime[state][act] / sampled_frequency_s_a[state][act]))
            thirdterm = mdp.Vmax * math.sqrt((math.log(c * mdp.numStates * mdp.numActions) - math.log(delta)) / sampled_frequency_s_a[state][act])
            QupperMBAE[state][act] = firstterm + secondterm + thirdterm
            QlowerMBAE[state][act] = firstterm + lower_secondterm - thirdterm
        VupperMBAE[state] = np.amax(QupperMBAE[state])
        VlowerMBAE[state] = np.amax(QlowerMBAE[state])
    Qupper = np.copy(QupperMBAE)
    Qlower = np.copy(QlowerMBAE)

    if verbose == 0:
        outp = open(mdp.filename + '-rr' + str(randomseed) + '.txt', 'w')
        ff = open(mdp.filename + '-rr-samples.txt', 'w+')

    while iteration < MAX_ITERATION_LIMIT:
        ### One round-robin pass over all state-action pairs
        for state1 in range(mdp.numStates):
            for act1 in range(mdp.numActions):
                iteration += 1
                sampled_frequency_s_a[state1][act1] += 1
                # Simulate the MDP with this state-action pair and update the counts
                for t in range(1):
                    s_prime, r = mdp.simulate(state1, act1)
                    rewards_s_a_sprime[state1][act1][s_prime] += r
                    R_s_a[state1][act1] = (r + R_s_a[state1][act1] * sampled_frequency_s_a[state1][act1]) / (sampled_frequency_s_a[state1][act1] + 1)
                    N_s_a_sprime[state1][act1][s_prime] += 1
                    for s2 in range(mdp.numStates):
                        P[state1][act1][s2] = N_s_a_sprime[state1][act1][s2] / sampled_frequency_s_a[state1][act1]

                ## Calculating Q and V values
                for i in range(mdp.numStates):
                    for j in range(mdp.numActions):
                        if sampled_frequency_s_a[i][j] > 0:
                            P_tilda[i][j] = UpperP(i, j, delta, N_s_a_sprime[i][j], mdp.numStates, Vupper, False)
                            P_lower_tilda[i][j] = LowerP(i, j, delta, N_s_a_sprime[i][j], mdp.numStates, Vlower, False)
                Qupper, Vupper = iteratedConvergence(Qupper, R_s_a, P_tilda, mdp.discountFactor, epsilon, converge_iterations, epsilon_convergence)
                Qlower, Vlower = iteratedConvergence(Qlower, R_s_a, P_lower_tilda, mdp.discountFactor, epsilon, converge_iterations, epsilon_convergence)

                # Calculations for QupperMBAE and QlowerMBAE
                for state in range(mdp.numStates):
                    for act in range(mdp.numActions):
                        firstterm = np.sum(rewards_s_a_sprime[state][act]) / sampled_frequency_s_a[state][act]
                        secondterm = mdp.discountFactor * np.sum(VupperMBAE * (N_s_a_sprime[state][act] / sampled_frequency_s_a[state][act]))
                        lower_secondterm = mdp.discountFactor * np.sum(VlowerMBAE * (N_s_a_sprime[state][act] / sampled_frequency_s_a[state][act]))
                        star_secondterm = mdp.discountFactor * np.sum(Vstar * (N_s_a_sprime[state][act] / sampled_frequency_s_a[state][act]))
                        thirdterm = mdp.Vmax * math.sqrt((math.log(c * (iteration**2) * mdp.numStates * mdp.numActions) - math.log(delta)) / sampled_frequency_s_a[state][act])
                        QupperMBAE[state][act] = firstterm + secondterm + thirdterm
                        QlowerMBAE[state][act] = firstterm + lower_secondterm - thirdterm
                        Qstar[state][act] = firstterm + star_secondterm
                    VupperMBAE[state] = np.amax(QupperMBAE[state])
                    VlowerMBAE[state] = np.amax(QlowerMBAE[state])
                    Vstar[state] = np.amax(Qstar[state])

        count = 0
        if iteration % 100 == 0:
            for i in range(mdp.numStates):
                if final_policy[i] == -1:
                    final_policy[i] = bestTwoActions(mdp, i, QlowerMBAE, QupperMBAE, Qstar)[0]
            acList = bestTwoActions(mdp, start_state, Qlower, Qupper, Qstar)
            if verbose == 0:
                outp.write(str(iteration))
                outp.write('\t')
                outp.write(str(QupperMBAE[start_state][acList[1]] - QlowerMBAE[start_state][acList[0]]))
                print(str(evaluatePolicy(mdp, final_policy, start_state)))
                outp.write('\n')
            else:
                print(iteration, (QupperMBAE[start_state][acList[1]] - QlowerMBAE[start_state][acList[0]]))
            np.savetxt(ff, sampled_frequency_s_a, delimiter=',')
            ff.write('\n')

        #### Check epsilon condition for the starting state only
        acList = bestTwoActions(mdp, start_state, Qlower, Qupper, Qstar)
        if QupperMBAE[start_state][acList[1]] - QlowerMBAE[start_state][acList[0]] < epsilon * (1 - mdp.discountFactor) / 2 and iteration > 50:
            print(QupperMBAE[start_state][acList[1]] - QlowerMBAE[start_state][acList[0]], "<", epsilon * (1 - mdp.discountFactor) / 2)
            acList = bestTwoActions(mdp, start_state, QlowerMBAE, QupperMBAE, Qstar)
            a = open('final' + mdp.filename + '-rr.txt', 'a+')
            a.write(str(iteration) + '\n')
            a.close()
            print("Setting final_policy of ", start_state, " to", acList[0])
            final_policy[start_state] = acList[0]
            print("Iterations taken : ", iteration)
            print("Returning the policy :", final_policy)
            for i in range(mdp.numStates):
                if final_policy[i] == -1:
                    final_policy[i] = bestTwoActions(mdp, i, QlowerMBAE, QupperMBAE, Qstar)[0]
            return final_policy

    for i in range(mdp.numStates):
        if final_policy[i] == -1:
            final_policy[i] = bestTwoActions(mdp, i, QlowerMBAE, QupperMBAE, Qstar)[0]
    return final_policy
def LUCBBound(mdp, start_state=0, epsilon=4, delta=0.1, fileprint=1):
    global MAX_ITERATION_LIMIT, c
    iteration = 0
    it = 0
    H = int((math.log(mdp.Vmax) + math.log(6.0 / epsilon)) / (1 - mdp.discountFactor))
    initial_iterations = 1 * mdp.numStates * mdp.numActions
    rewards_s_a_sprime = np.zeros((mdp.numStates, mdp.numActions, mdp.numStates))
    R_s_a = np.zeros((mdp.numStates, mdp.numActions))
    sampled_frequency_s_a = np.zeros((mdp.numStates, mdp.numActions))
    N_s_a_sprime = np.zeros((mdp.numStates, mdp.numActions, mdp.numStates))
    P = np.zeros((mdp.numStates, mdp.numActions, mdp.numStates))
    P_tilda = np.zeros((mdp.numStates, mdp.numActions, mdp.numStates))
    P_lower_tilda = np.zeros((mdp.numStates, mdp.numActions, mdp.numStates))
    VlowerMBAE = np.zeros(mdp.numStates)
    Vlower = np.zeros(mdp.numStates)
    Vstar = (mdp.Vmax / 2) * np.ones(mdp.numStates)
    Vupper = mdp.Vmax * np.ones(mdp.numStates)
    Qlower = np.zeros((mdp.numStates, mdp.numActions))
    VupperMBAE = mdp.Vmax * np.ones(mdp.numStates)
    QlowerMBAE = np.zeros((mdp.numStates, mdp.numActions))
    Qstar = (mdp.Vmax / 2) * np.ones((mdp.numStates, mdp.numActions))
    QupperMBAE = mdp.Vmax * np.ones((mdp.numStates, mdp.numActions))
    Qupper = mdp.Vmax * np.ones((mdp.numStates, mdp.numActions))
    final_policy = (-1) * np.ones(mdp.numStates, dtype=int)
    states_to_sample = list(range(mdp.numStates))
    colliding_values = np.zeros(mdp.numStates)
    converge_iterations = 10000
    epsilon_convergence = 1e-4
    is_converged = 0
    print("Vmax", mdp.Vmax)

    ### Initial sampling for all state action pairs
    while it < initial_iterations:
        for state in range(mdp.numStates):
            for act in range(mdp.numActions):
                it += 1
                s_prime, r = mdp.simulate(state, act)
                rewards_s_a_sprime[state][act][s_prime] += r
                R_s_a[state][act] = (r + R_s_a[state][act] * sampled_frequency_s_a[state][act]) / (sampled_frequency_s_a[state][act] + 1)
                sampled_frequency_s_a[state][act] += 1
                N_s_a_sprime[state][act][s_prime] += 1

    ### Calculating V, Q estimates (MBAE) from the initial samples
    for internal in range(converge_iterations):
        oldQlowerMBAE = np.copy(QlowerMBAE[start_state])
        for state in range(mdp.numStates):
            for act in range(mdp.numActions):
                # Calculations for QupperMBAE and QlowerMBAE
                firstterm = np.sum(rewards_s_a_sprime[state][act]) / sampled_frequency_s_a[state][act]
                secondterm = mdp.discountFactor * np.sum(VupperMBAE * (N_s_a_sprime[state][act] / sampled_frequency_s_a[state][act]))
                lower_secondterm = mdp.discountFactor * np.sum(VlowerMBAE * (N_s_a_sprime[state][act] / sampled_frequency_s_a[state][act]))
                thirdterm = mdp.Vmax * math.sqrt((math.log(c * mdp.numStates * mdp.numActions) - math.log(delta)) / sampled_frequency_s_a[state][act])
                QupperMBAE[state][act] = firstterm + secondterm + thirdterm
                QlowerMBAE[state][act] = firstterm + lower_secondterm - thirdterm
            VupperMBAE[state] = np.amax(QupperMBAE[state])
            VlowerMBAE[state] = np.amax(QlowerMBAE[state])
        if np.linalg.norm(oldQlowerMBAE - QlowerMBAE[start_state]) <= epsilon_convergence:
            print("Stopping with ", internal, "initial internal iterations")
            break
    if internal == converge_iterations - 1:
        print("Used all iterations")
    print("Initial estimate of QupperMBAE found! Now sampling")

    sys.stdout = open(mdp.filename + '-lucbbound.txt', 'w+')
    ff = open(mdp.filename + '-lucbbound-samples.txt', 'w+')
    h = 0
    state1 = start_state
    while iteration < MAX_ITERATION_LIMIT:
        max_collision_state = [sorted(states_to_sample, key=lambda x: colliding_values[x], reverse=True)[0]]
        if h % H == 0:
            state1 = start_state
            h = 0
        else:
            state1 = nextstate
        actionsList = bestTwoActions(mdp, state1, Qlower, Qupper, Qstar)
        a = np.random.choice(actionsList)
        iteration += 1
        sampled_frequency_s_a[state1][a] += 1
        for t in range(1):
            s_prime, r = mdp.simulate(state1, a)
            nextstate = s_prime
            rewards_s_a_sprime[state1][a][s_prime] += r
            R_s_a[state1][a] = (r + R_s_a[state1][a] * sampled_frequency_s_a[state1][a]) / (sampled_frequency_s_a[state1][a] + 1)
            N_s_a_sprime[state1][a][s_prime] += 1

        ## Calculating Q and V values
        for i in range(mdp.numStates):
            for j in range(mdp.numActions):
                if sampled_frequency_s_a[i][j] > 0:
                    P_tilda[i][j] = UpperP(i, j, delta, N_s_a_sprime[i][j], mdp.numStates, Vupper, False)
                    P_lower_tilda[i][j] = LowerP(i, j, delta, N_s_a_sprime[i][j], mdp.numStates, Vlower, False)
        Qupper, Vupper = iteratedConvergence(Qupper, R_s_a, P_tilda, mdp.discountFactor, epsilon, converge_iterations, epsilon_convergence)
        Qlower, Vlower = iteratedConvergence(Qlower, R_s_a, P_lower_tilda, mdp.discountFactor, epsilon, converge_iterations, epsilon_convergence)

        # Calculations for QupperMBAE and QlowerMBAE (iterated to convergence)
        for internal in range(converge_iterations):
            oldQlowerMBAE = np.copy(QlowerMBAE[start_state])
            for state in range(mdp.numStates):
                for act in range(mdp.numActions):
                    firstterm = np.sum(rewards_s_a_sprime[state][act]) / sampled_frequency_s_a[state][act]
                    secondterm = mdp.discountFactor * np.sum(VupperMBAE * (N_s_a_sprime[state][act] / sampled_frequency_s_a[state][act]))
                    lower_secondterm = mdp.discountFactor * np.sum(VlowerMBAE * (N_s_a_sprime[state][act] / sampled_frequency_s_a[state][act]))
                    star_secondterm = mdp.discountFactor * np.sum(Vstar * (N_s_a_sprime[state][act] / sampled_frequency_s_a[state][act]))
                    thirdterm = mdp.Vmax * math.sqrt((math.log(c * (iteration**2) * mdp.numStates * mdp.numActions) - math.log(delta)) / sampled_frequency_s_a[state][act])
                    QupperMBAE[state][act] = firstterm + secondterm + thirdterm
                    QlowerMBAE[state][act] = firstterm + lower_secondterm - thirdterm
                    Qstar[state][act] = firstterm + star_secondterm
                VupperMBAE[state] = np.amax(QupperMBAE[state])
                VlowerMBAE[state] = np.amax(QlowerMBAE[state])
                Vstar[state] = np.amax(Qstar[state])
            if np.linalg.norm(oldQlowerMBAE - QlowerMBAE[start_state]) <= epsilon_convergence:
                break

        count = 0
        if iteration % 10000 == 0:
            acList = bestTwoActions(mdp, start_state, Qlower, Qupper, Qstar)
            print(iteration, (QupperMBAE[start_state][acList[1]] - QlowerMBAE[start_state][acList[0]]) / epsilon)
            np.savetxt(ff, sampled_frequency_s_a, delimiter=',')
            ff.write('\n')

        ##### Updating the list of colliding states
        states_to_sample = []
        for st in range(mdp.numStates):
            acList = bestTwoActions(mdp, st, Qlower, Qupper, Qstar)
            ##### Stopping condition uses epsilon*(1-gamma)/2
            colliding_values[st] = Qupper[st][acList[1]] - Qlower[st][acList[0]] - epsilon * (1 - mdp.discountFactor) / 2
            if colliding_values[st] > 0:
                ### this state is still colliding, keep sampling it
                states_to_sample.append(st)

        #### Check epsilon condition for the starting state only
        if start_state not in states_to_sample:
            acList = bestTwoActions(mdp, start_state, Qlower, Qupper, Qstar)
            print("Setting final_policy of ", start_state, " to", acList[0])
            final_policy[start_state] = acList[0]
            print("Iterations taken : ", iteration)
            print("Returning the policy :", final_policy)
            for i in range(mdp.numStates):
                if final_policy[i] == -1:
                    final_policy[i] = bestTwoActions(mdp, i, Qlower, Qupper, Qstar)[0]
            return final_policy
        h += 1

    for i in range(mdp.numStates):
        if final_policy[i] == -1:
            final_policy[i] = bestTwoActions(mdp, i, Qlower, Qupper, Qstar)[0]
    return final_policy
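# A minimal usage sketch, assuming the module-level globals (c, verbose,
# MAX_ITERATION_LIMIT, converge_iterations, epsilon_convergence, use_mbae,
# plot_vstar) are set elsewhere in this file, and that an MDP object exposing
# numStates, numActions, Vmax, discountFactor, filename and simulate() is
# available; the _RandomMDPStub class above can stand in for it when smoke-testing.
if __name__ == '__main__':
    demo_mdp = _RandomMDPStub(numStates=5, numActions=2, discountFactor=0.9, seed=0)
    policy = RoundRobin(demo_mdp, start_state=0, epsilon=4, randomseed=0, delta=0.1)
    print("RoundRobin policy:", policy)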