def markovchainesti(mdp, start_state=0, epsilon=4, randomseed=None, algo="episodic", delta=0.1, bounds="MBAE"): if(randomseed is not None): np.random.seed(randomseed) policies = np.array(getPolicies(mdp.numStates, mdp.numActions)) numPolicies = len(policies) print("Total policies: ", numPolicies) H = int(math.log(epsilon/(2*mdp.Vmax*(1 - mdp.discountFactor)))/math.log(mdp.discountFactor)) print("Chosen value of H is : ", H) ## Initializations it = 0 samples = 0 initial_iterations = 1 * mdp.numStates * mdp.numActions R_s_a = np.zeros((mdp.numStates, mdp.numActions)) R_s_a_sprime = np.zeros((mdp.numStates, mdp.numActions, mdp.numStates)) N_s_a_sprime = np.zeros((mdp.numStates, mdp.numActions, mdp.numStates), dtype=np.int) N_s_a = np.zeros((mdp.numStates, mdp.numActions), dtype=np.int) P_s_a_sprime = np.zeros((mdp.numStates, mdp.numActions, mdp.numStates)) Qupper = mdp.Vmax*np.ones((numPolicies, mdp.numStates)) QupperMBAE = mdp.Vmax*np.ones((numPolicies, mdp.numStates)) Qlower = np.zeros((numPolicies, mdp.numStates)) Qstar = (mdp.Vmax/2)*np.ones((numPolicies, mdp.numStates)) QstarMBAE = (mdp.Vmax/2)*np.ones((numPolicies, mdp.numStates)) QlowerMBAE = np.zeros((numPolicies, mdp.numStates)) V_true = np.zeros(mdp.numStates) V_estimate = np.zeros(mdp.numStates) V_error = np.zeros(MAX_ITERATION_LIMIT + 1) P_tilda = np.zeros((numPolicies, mdp.numStates,mdp.numStates)) P_lower_tilda = np.zeros((numPolicies, mdp.numStates,mdp.numStates)) VlowerMBAE = np.zeros((numPolicies, mdp.numStates)) VupperMBAE = mdp.Vmax*np.ones((numPolicies, mdp.numStates)) Vstar = (mdp.Vmax/2)*np.ones((numPolicies, mdp.numStates)) discovered_states = set([start_state]) deltadeltaV = np.zeros((mdp.numStates)) state_dist = np.zeros((mdp.numStates)) state_dist[start_state] = 1 print(state_dist) while it < initial_iterations: for state in range(mdp.numStates): for act in range(mdp.numActions): it = it + 1 ss, rr = mdp.simulate(state, act) print("Sampling", state, act, rr, ss) R_s_a[state][act] = (rr + R_s_a[state][act] * N_s_a[state][act])/(N_s_a[state][act] + 1) R_s_a_sprime[state][act][ss] = rr N_s_a[state][act] = N_s_a[state][act] + 1 N_s_a_sprime[state][act][ss] = N_s_a_sprime[state][act][ss] + 1 for s2 in range(mdp.numStates): P_s_a_sprime[state][act][s2] = (float)(N_s_a_sprime[state][act][s2])/N_s_a[state][act] samples += initial_iterations if(algo=="use_ddv"): ff = open(f"logs/{mdp.filename}-markovddv{randomseed}.txt", 'w') elif(algo=="episodic"): ff = open(f"logs/{mdp.filename}-markoveps{randomseed}.txt", 'w') elif(algo=="uniform"): ff = open(f"logs/{mdp.filename}-markovuni{randomseed}.txt", 'w') elif(algo=="greedyMBAE"): ff = open(f"logs/{mdp.filename}-markovMBAE{randomseed}.txt", 'w') elif(algo=="greedyMBIE"): ff = open(f"logs/{mdp.filename}-markovMBIE{randomseed}.txt", 'w') elif(algo=="mybest"): ff = open(f"logs/{mdp.filename}-markovbest{randomseed}.txt", 'w') elif(algo=="runcertainty"): ff = open(f"logs/{mdp.filename}-markovruncertainty{randomseed}.txt", 'w') elif(algo=="unc_contri"): ff = open(f"logs/{mdp.filename}-markovunc_contri{randomseed}.txt", 'w') while samples<MAX_ITERATION_LIMIT/2: p = 0 current_policy = fixedPolicy for i in range(mdp.numStates): # print "For state ", i, " doing UpperP" if(N_s_a[i][current_policy[i]]>0): P_tilda[p][i] = UpperP( i, current_policy[i], delta, N_s_a_sprime[i][current_policy[i]], mdp.numStates, Qupper[p], False ) P_lower_tilda[p][i] = LowerP( i, current_policy[i], delta, N_s_a_sprime[i][current_policy[i]], mdp.numStates, Qlower[p], False ) Qupper[p] = itConvergencePolicy( 
Qupper[p], getRewards(R_s_a, current_policy), P_tilda[p], mdp.discountFactor, epsilon, converge_iterations, epsilon_convergence ) Qlower[p] = itConvergencePolicy( Qlower[p], getRewards(R_s_a, current_policy), P_lower_tilda[p], mdp.discountFactor, epsilon, converge_iterations, epsilon_convergence ) Qstar[p] = itConvergencePolicy( Qstar[p], getRewards(R_s_a, current_policy), getProb(P_s_a_sprime, current_policy), mdp.discountFactor, epsilon, converge_iterations, epsilon_convergence ) for internal in range(converge_iterations): oldQlowerMBAE = np.copy(QlowerMBAE[p][start_state]) for state in range(mdp.numStates): act = current_policy[state] firstterm = R_s_a[state][act] secondterm = mdp.discountFactor*np.sum(VupperMBAE[p]*(P_s_a_sprime[state][act])) lower_secondterm = mdp.discountFactor*np.sum(VlowerMBAE[p]*(P_s_a_sprime[state][act])) star_secondterm = mdp.discountFactor*np.sum(Vstar[p]*(P_s_a_sprime[state][act])) thirdterm = mdp.Vmax*math.sqrt((math.log(c*(samples**2)*mdp.numStates*1)-math.log(delta))/N_s_a[state][act]) QupperMBAE[p][state] = firstterm + secondterm + thirdterm QlowerMBAE[p][state] = firstterm + lower_secondterm - thirdterm QstarMBAE[p][state] = firstterm + star_secondterm VupperMBAE[p][state] = QupperMBAE[p][state] VlowerMBAE[p][state] = QlowerMBAE[p][state] Vstar[p][state] = QstarMBAE[p][state] if(np.linalg.norm(oldQlowerMBAE-QlowerMBAE[p][start_state])<=epsilon_convergence): break policy1Index = 0 h = 0 policy1 = fixedPolicy state = start_state # print "samples", samples if (samples%10000)<100: if(verbose==0): ff.write(str(samples)) ff.write('\t') if(plot_vstar): ff.write(str(Vstar[policy1Index][start_state])) else: ff.write(str(QupperMBAE[policy1Index][start_state]-QlowerMBAE[policy1Index][start_state]))#-epsilon*(1-mdp.discountFactor)/2 print(samples, QupperMBAE[policy1Index][start_state]-QlowerMBAE[policy1Index][start_state]) ff.write('\n') else: print(samples) print(QupperMBAE[:,start_state], QlowerMBAE[:,start_state]) polList = [policy1Index] # print(R_s_a.shape) # print(mdp.rewards) # print(mdp.transitionProbabilities) # print(getAverageRewards(mdp.numStates, mdp.numActions, mdp.rewards, mdp.transitionProbabilities)) # print(getRewards(R_s_a, current_policy).shape) # print(getRewards(getAverageRewards(mdp.numStates, mdp.numActions, mdp.rewards, mdp.transitionProbabilities), current_policy).shape) # print(mdp.transitionProbabilities.shape) # print(P_s_a_sprime.shape) # print(getProb(mdp.transitionProbabilities, current_policy)) # print(getProb(P_s_a_sprime, current_policy)) # V_true = itConvergencePolicy(V_true, # getRewards(getAverageRewards(mdp.numStates, mdp.numActions, mdp.rewards, mdp.transitionProbabilities), current_policy), # getProb(mdp.transitionProbabilities, current_policy), # mdp.discountFactor, # epsilon, # converge_iterations, # epsilon_convergence # ) # V_estimate = itConvergencePolicy(V_estimate, # getRewards(R_s_a, current_policy), # getProb(P_s_a_sprime, current_policy), # mdp.discountFactor, # epsilon, # converge_iterations, # epsilon_convergence # ) #print(V_estimate) V_true = itConvergencePolicy(V_true, getRewards(getAverageRewards(mdp.numStates, mdp.numActions, mdp.rewards, mdp.transitionProbabilities), current_policy), getProb(mdp.transitionProbabilities, current_policy), mdp.discountFactor, epsilon, converge_iterations, epsilon_convergence ) if(algo=="use_ddv"): ## Caclulate V for all states for pnum in polList: policiesfddv = fixedPolicy # print "Getting DDV values" for st in list(discovered_states): ac = policiesfddv[st] #### Compute del del 
V deltadeltaV[st] = CalculateDelDelV( st, ac, mdp, N_s_a_sprime, QupperMBAE[pnum], QlowerMBAE[pnum], None, None, start_state, P_s_a_sprime, P_tilda[pnum], P_lower_tilda[pnum], R_s_a, epsilon, delta, converge_iterations, epsilon_convergence, policiesfddv ) # print deltadeltaV cs = np.argmax(deltadeltaV) ca = policiesfddv[cs] # print deltadeltaV, cs, ca # print deltadeltaV, policy1, policy2 # print "Found max state for DDV: ",cs,ca # time.sleep(0.1) ss, rr = mdp.simulate(cs, ca) # print "Policy is ", policiesfddv # print "Sampling ", cs, ca time.sleep(0.1) samples = samples + 1 discovered_states.add(ss) R_s_a[cs][ca] = (rr + R_s_a[cs][ca]*N_s_a[cs][ca])/(N_s_a[cs][ca]+1) N_s_a[cs][ca] += 1 N_s_a_sprime[cs][ca][ss] += 1 # P_s_a_sprime = np.copy(N_s_a_sprime) for s2 in range(mdp.numStates): P_s_a_sprime[cs][ca][s2] = (float)(N_s_a_sprime[cs][ca][s2])/N_s_a[cs][ca] elif(algo == "episodic"): while h<H: act = policy1[state] # print "------>",current_state, current_action ss, rr = mdp.simulate(state, act) # print "Sampling ", state, act samples+=1 R_s_a[state][act] = (rr + R_s_a[state][act]*N_s_a[state][act])/(N_s_a[state][act]+1) N_s_a[state][act] += 1 N_s_a_sprime[state][act][ss] += 1 # P_s_a_sprime = np.copy(N_s_a_sprime) for s2 in range(mdp.numStates): P_s_a_sprime[state][act][s2] = (float)(N_s_a_sprime[state][act][s2])/N_s_a[state][act] state = ss h+=1 V_error[samples] = ErrorV(mdp, V_true, V_estimate, R_s_a, P_s_a_sprime, current_policy, start_state, epsilon, converge_iterations, epsilon_convergence) elif(algo == "uniform"): for st in range(mdp.numStates): ac = fixedPolicy[st] ss, rr = mdp.simulate(st, ac) # print "Sampling ", st, ac samples += 1 R_s_a[st][ac] = (rr + R_s_a[st][ac]*N_s_a[st][ac])/(N_s_a[st][ac]+1) N_s_a[st][ac] += 1 N_s_a_sprime[st][ac][ss] += 1 for s2 in range(mdp.numStates): P_s_a_sprime[st][ac][s2] = (float)(N_s_a_sprime[st][ac][s2])/N_s_a[st][ac] V_error[samples] = ErrorV(mdp, V_true, V_estimate, R_s_a, P_s_a_sprime, current_policy, start_state, epsilon, converge_iterations, epsilon_convergence) elif(algo == "runcertainty"): deltaW = np.zeros(mdp.numStates) mu = np.zeros(mdp.numStates) D = np.zeros(mdp.numStates) mu[start_state] = 1 for t in range(H): D = D + (mdp.discountFactor**t) * mu mu = prob_step(mu, P_s_a_sprime, fixedPolicy) for st in range(mdp.numStates): #transition uncertainty for given s, pi(s) deltaW[st] = delW(st, fixedPolicy[st], delta, N_s_a_sprime[st][fixedPolicy[st]], mdp.numStates, False) st = np.argmax(deltaW * D) # if samples % 100 == 0: # print deltaW, D, deltaW * D, np.argmax(deltaW * D) ac = fixedPolicy[st] ss, rr = mdp.simulate(st, ac) # print "Sampling ", st, ac, rr, ss samples += 1 R_s_a[st][ac] = (rr + R_s_a[st][ac]*N_s_a[st][ac])/(N_s_a[st][ac]+1) N_s_a[st][ac] += 1 N_s_a_sprime[st][ac][ss] += 1 for s2 in range(mdp.numStates): P_s_a_sprime[st][ac][s2] = (float)(N_s_a_sprime[st][ac][s2])/N_s_a[st][ac] V_error[samples] = ErrorV(mdp, V_true, V_estimate, R_s_a, P_s_a_sprime, current_policy, start_state, epsilon, converge_iterations, epsilon_convergence) elif(algo == "unc_contri"): mu = np.zeros(mdp.numStates) D = np.zeros(mdp.numStates) z_quantile = 2.0 mu[start_state] = 1 transitionEstimate = getProb(P_s_a_sprime, fixedPolicy) rewardEstimate = getRewards(R_s_a, fixedPolicy) for t in range(H): D = D + (mdp.discountFactor**t) * mu mu = np.dot(mu, transitionEstimate) V_esti = np.dot(rewardEstimate, D) V_uncertainty = np.zeros(mdp.numStates) for s in range(mdp.numStates): transitionEstimateUpper = np.copy(transitionEstimate) transitionEstimateLower 
= np.copy(transitionEstimate) for sprime in range(mdp.numStates): #Wilson score interval confidence bounds for each transition muTerm = (transitionEstimate[s][sprime] + (z_quantile**2)/(2*N_s_a[s][fixedPolicy[s]])) / (1 + (z_quantile**2)/N_s_a[s][fixedPolicy[s]]) devTerm = (z_quantile / (1 + (z_quantile**2)/N_s_a[s][fixedPolicy[s]])) * math.sqrt( (transitionEstimate[s][sprime]*(1-transitionEstimate[s][sprime])/N_s_a[s][fixedPolicy[s]]) + (z_quantile**2)/(4*N_s_a[s][fixedPolicy[s]]**2)) # print(transitionEstimate[s][sprime], muTerm, devTerm) # if s == 1: # print "mu", muTerm # print "dev", devTerm # print (z_quantile / (1 + (z_quantile**2)/N_s_a[s][fixedPolicy[s]])) # print z_quantile, N_s_a[s][fixedPolicy[s]] # print (transitionEstimate[s][sprime]*(1-transitionEstimate[s][sprime])/N_s_a[s][fixedPolicy[s]]) # print (z_quantile**2)/(4*N_s_a[s][fixedPolicy[s]]**2) transitionEstimateUpper[s][sprime] = muTerm + devTerm transitionEstimateLower[s][sprime] = muTerm - devTerm #print(transitionEstimateUpper[s][sprime], transitionEstimateLower[s][sprime] ) #print("_____________") # if samples > 49500: # print(s, N_s_a[s][fixedPolicy[s]]) # print(transitionEstimate) # print(transitionEstimateUpper) # print(transitionEstimateLower) upperD = np.zeros(mdp.numStates) lowerD = np.zeros(mdp.numStates) uppermu = np.zeros(mdp.numStates) uppermu[start_state] = 1 lowermu = np.zeros(mdp.numStates) lowermu[start_state] = 1 for t in range(H): upperD = upperD + (mdp.discountFactor**t) * uppermu uppermu = np.dot(uppermu, transitionEstimateUpper) lowerD = lowerD + (mdp.discountFactor**t) * lowermu lowermu = np.dot(lowermu, transitionEstimateLower) # if samples > 49500: # print("___________") # print(upperD) # print(lowerD) # print(rewardEstimate) # print(np.dot(rewardEstimate, upperD) - np.dot(rewardEstimate, lowerD)) # print(V_uncertainty) V_uncertainty[s] = abs(np.dot(rewardEstimate, upperD) - np.dot(rewardEstimate, lowerD)) # if samples > 49500: # print("V_unc", V_uncertainty) st = np.argmax(V_uncertainty) ac = fixedPolicy[st] ss, rr = mdp.simulate(st, ac) # if samples > 49500: # print(f"Sample no. 
{samples}", st, ac, rr, ss) samples += 1 R_s_a[st][ac] = (rr + R_s_a[st][ac]*N_s_a[st][ac])/(N_s_a[st][ac]+1) N_s_a[st][ac] += 1 N_s_a_sprime[st][ac][ss] += 1 for s2 in range(mdp.numStates): P_s_a_sprime[st][ac][s2] = (float)(N_s_a_sprime[st][ac][s2])/N_s_a[st][ac] V_error[samples] = ErrorV(mdp, V_true, V_estimate, R_s_a, P_s_a_sprime, current_policy, start_state, epsilon, converge_iterations, epsilon_convergence) # if samples > 49500: # print(f"Error = {V_error[samples]}") # if V_error[samples]/V_true[start_state] > -0.5 : # print(samples) # if V_error[samples]/V_true[start_state] > 0.35 and V_error[samples]> V_error[samples-1]: # print samples elif(algo == "greedyMBAE"): st = max(range(mdp.numStates), key=lambda x: VupperMBAE[0][x]-VlowerMBAE[0][x]) ac = fixedPolicy[st] ss, rr = mdp.simulate(st, ac) # print "Sampling ", st, ac samples += 1 R_s_a[st][ac] = (rr + R_s_a[st][ac]*N_s_a[st][ac])/(N_s_a[st][ac]+1) N_s_a[st][ac] += 1 N_s_a_sprime[st][ac][ss] += 1 for s2 in range(mdp.numStates): P_s_a_sprime[st][ac][s2] = (float)(N_s_a_sprime[st][ac][s2])/N_s_a[st][ac] elif(algo == "greedyMBIE"): st = max(range(mdp.numStates), key=lambda x: Qupper[0][x]-Qlower[0][x]) ac = fixedPolicy[st] ss, rr = mdp.simulate(st, ac) # print "Sampling ", st, ac samples += 1 R_s_a[st][ac] = (rr + R_s_a[st][ac]*N_s_a[st][ac])/(N_s_a[st][ac]+1) N_s_a[st][ac] += 1 N_s_a_sprime[st][ac][ss] += 1 for s2 in range(mdp.numStates): P_s_a_sprime[st][ac][s2] = (float)(N_s_a_sprime[st][ac][s2])/N_s_a[st][ac] elif(algo == "mybest"): if(samples%10000<50): state_dist = np.zeros((mdp.numStates)) state_dist[start_state] = 1 N = getSampleCount(state_dist, N_s_a_sprime, QupperMBAE[policy1Index], QlowerMBAE[policy1Index], QstarMBAE[policy1Index]) # print N for i in range(N): # print state_dist, samples, P_s_a_sprime # import pdb; pdb.set_trace() st = np.random.choice(np.arange(mdp.numStates), p=state_dist) ac = fixedPolicy[st] ss, rr = mdp.simulate(st, ac) # print "Sampling ", st, ac samples += 1 R_s_a[st][ac] = (rr + R_s_a[st][ac]*N_s_a[st][ac])/(N_s_a[st][ac]+1) N_s_a[st][ac] += 1 N_s_a_sprime[st][ac][ss] += 1 for s2 in range(mdp.numStates): P_s_a_sprime[st][ac][s2] = (float)(N_s_a_sprime[st][ac][s2])/N_s_a[st][ac] state_dist = prob_step(state_dist, P_s_a_sprime, fixedPolicy) if (samples%1000)<100: if(QupperMBAE[policy1Index][start_state]-QlowerMBAE[policy1Index][start_state]-epsilon*(1-mdp.discountFactor)/2<0): print(Qupper[policy1Index][start_state],Qstar[policy1Index][start_state],epsilon*(1-mdp.discountFactor)/2) print("Epsilon condition reached at ",samples, " samples") return fixedPolicy else: # print QupperMBAE[policy2Index][start_state],QstarMBAE[policy1Index][start_state],epsilon*(1-mdp.discountFactor)/2 pass # print "ends here" print(mdp.numStates, mdp.numActions) # plt.plot(1 + np.arange(MAX_ITERATION_LIMIT//2)[mdp.numStates * mdp.numActions + 500:], V_error[mdp.numStates * mdp.numActions + 500: MAX_ITERATION_LIMIT//2]/ V_true[start_state]) # plt.title('Uniform Sampling') # plt.xlabel('samples') # plt.ylabel('Error fraction in value function') # plt.show() print(algo, " ", V_true) print(algo, " ", V_estimate) ff.close() return V_error/V_true[start_state]
def FeichterPolicy(mdp, start_state=0, epsilon=1, randomseed=None, delta=0.1):
    global c
    if (randomseed is not None):
        np.random.seed(randomseed)
    # orig_stdout = sys.stdout
    # f = open('Fiechter-m01.txt', 'w')
    # sys.stdout = f

    ##### Initialisation
    print(mdp.Vmax, 6 / epsilon, mdp.discountFactor)
    H = int((math.log(mdp.Vmax) + math.log(6.0 / epsilon)) / (1 - mdp.discountFactor))
    print("Chosen value of H is : ", H)
    N_h_s_a = np.zeros((H, mdp.numStates, mdp.numActions))
    N_h_s_a_s_prime = np.zeros((H, mdp.numStates, mdp.numActions, mdp.numStates), dtype=int)
    rewards_s_a_sprime = np.zeros((mdp.numStates, mdp.numActions, mdp.numStates))
    R_s_a = np.zeros((mdp.numStates, mdp.numActions))
    P_h_s_a_s_prime = np.zeros((H, mdp.numStates, mdp.numActions, mdp.numStates))
    P_tilda = np.zeros((mdp.numStates, mdp.numActions, mdp.numStates))
    P_lower_tilda = np.zeros((mdp.numStates, mdp.numActions, mdp.numStates))
    policy_h_s = np.zeros((H, mdp.numStates), dtype=int)
    d_h_policy_s = np.zeros((H + 1, mdp.numStates))
    dmax = 12 * mdp.Vmax / (epsilon * (1 - mdp.discountFactor))
    converge_iterations = 10000
    epsilon_convergence = 1e-4
    Qlower = np.zeros((mdp.numStates, mdp.numActions))
    QlowerMBAE = np.zeros((mdp.numStates, mdp.numActions))
    QupperMBAE = mdp.Vmax * np.ones((mdp.numStates, mdp.numActions))
    Qupper = mdp.Vmax * np.random.random([mdp.numStates, mdp.numActions])
    QstarMBAE = (mdp.Vmax / 2) * np.ones((mdp.numStates, mdp.numActions))
    Qstar = (mdp.Vmax / 2) * np.ones((mdp.numStates, mdp.numActions))
    VupperMBAE = mdp.Vmax * np.ones((mdp.numStates))
    Vlower = np.zeros((mdp.numStates))
    VlowerMBAE = np.zeros((mdp.numStates))
    Vstar = (mdp.Vmax / 2) * np.ones((mdp.numStates))
    Vupper = mdp.Vmax * np.random.random([mdp.numStates])
    sampled_frequency_s_a = np.zeros((mdp.numStates, mdp.numActions))
    N_s_a_sprime = np.zeros((mdp.numStates, mdp.numActions, mdp.numStates))
    it = 0
    samples = 0
    initial_iterations = 1 * mdp.numStates * mdp.numActions

    ### Initial sampling for all state action pairs
    while it < initial_iterations:
        for state in range(mdp.numStates):
            for act in range(mdp.numActions):
                it += 1
                s_prime, r = mdp.simulate(state, act)
                rewards_s_a_sprime[state][act][s_prime] += r
                R_s_a[state][act] = (r + R_s_a[state][act] * sampled_frequency_s_a[state][act]) / (
                    sampled_frequency_s_a[state][act] + 1)
                sampled_frequency_s_a[state][act] += 1
                N_s_a_sprime[state][act][s_prime] += 1

    #### For starting the while loop below
    iteration = 1
    if (verbose == 0):
        outp = open(mdp.filename + '-fiechter' + str(randomseed) + '.txt', 'w')
        # sys.stdout = open(mdp.filename+'-fiechter.txt', 'w+')
    ff = open(mdp.filename + '-fiechter-samples.txt', 'w+')

    #### Exploration
    # while d_h_policy_s[0][start_state]>2/(1-mdp.discountFactor) or iteration==1:
    acList = bestTwoActions(mdp, start_state, Qlower, Qupper, Qstar)
    coll = Qupper[start_state][acList[1]] - Qlower[start_state][acList[0]] \
        - epsilon * (1 - mdp.discountFactor) / 2
    while coll > 0 or iteration < 50:
        # print d_h_policy_s[0][start_state], " > ", 2/(1-mdp.discountFactor)
        # print policy_h_s[0]
        h = 0
        current_state = start_state
        while h < H:
            current_action = policy_h_s[h][current_state]
            # print "------>",current_state, current_action
            s_prime, r = mdp.simulate(current_state, current_action)
            N_h_s_a[h][current_state][current_action] += 1
            rewards_s_a_sprime[current_state][current_action][s_prime] += r
            # Update the running mean reward of the pair that was actually sampled
            # (the original indexed this with the stale loop variables `state`/`act`
            # left over from the initial-sampling phase).
            R_s_a[current_state][current_action] = (
                r + R_s_a[current_state][current_action] *
                sampled_frequency_s_a[current_state][current_action]) / (
                    sampled_frequency_s_a[current_state][current_action] + 1)
            N_h_s_a_s_prime[h][current_state][current_action][s_prime] += 1
            N_s_a_sprime[current_state][current_action][s_prime] += 1
            sampled_frequency_s_a[current_state][current_action] += 1
            for s2 in range(mdp.numStates):
                P_h_s_a_s_prime[h][current_state][current_action][s2] = \
                    N_h_s_a_s_prime[h][current_state][current_action][s2] / \
                    N_h_s_a[h][current_state][current_action]
            h += 1
            current_state = s_prime
            samples += 1
            if (samples % 100 == 0):
                acList = bestTwoActions(mdp, start_state, QlowerMBAE, QupperMBAE, QstarMBAE)
                if (verbose == 0):
                    outp.write(str(samples))
                    outp.write('\t')
                    outp.write(str(QupperMBAE[start_state][acList[1]] -
                                   QlowerMBAE[start_state][acList[0]]))  #-epsilon*(1-mdp.discountFactor)/2
                    outp.write('\n')
                else:
                    print(Qupper[start_state], Qlower[start_state])
                    # print d_h_policy_s[0][start_state]-2/(1-mdp.discountFactor)
                    # print samples, (QupperMBAE[start_state][acList[1]]-QlowerMBAE[start_state][acList[0]])-epsilon*(1-mdp.discountFactor)/2
                np.savetxt(ff, sampled_frequency_s_a, delimiter=',')
                ff.write('\n')
                # print samples, d_h_policy_s[0][start_state]-2/(1-mdp.discountFactor)

        # Compute new policy via the backward dynamic program over horizons
        e_s_a = np.zeros((mdp.numStates, mdp.numActions))
        for h in range(H - 1, -1, -1):
            for state in range(mdp.numStates):
                current_max = -float("inf")
                argmax_action = -1
                for act in range(mdp.numActions):
                    if (N_h_s_a[h][state][act] == 0):
                        e_s_a[state][act] = dmax
                    else:
                        sqterm = (2 * math.log(4 * H * mdp.numStates * mdp.numActions) -
                                  2 * math.log(delta)) / N_h_s_a[h][state][act]
                        summation = np.sum((N_h_s_a_s_prime[h][state][act] /
                                            N_h_s_a[h][state][act]) * d_h_policy_s[h + 1])
                        secondterm = mdp.discountFactor * summation
                        e_s_a[state][act] = min(
                            dmax,
                            6 * mdp.Vmax * (math.sqrt(sqterm)) / (epsilon * (1 - delta)) + secondterm)
                policy_h_s[h][state] = np.argmax(e_s_a[state])
                d_h_policy_s[h][state] = np.amax(e_s_a[state])

        # Compute MBAE QupperMBAE and QlowerMBAE bounds
        for internal in range(converge_iterations):
            oldQlowerMBAE = np.copy(QlowerMBAE[start_state])
            for state in range(mdp.numStates):
                for act in range(mdp.numActions):
                    # Calculations for QupperMBAE and QlowerMBAE
                    firstterm = np.sum(rewards_s_a_sprime[state][act]) / sampled_frequency_s_a[state][act]
                    secondterm = mdp.discountFactor * np.sum(
                        VupperMBAE * (N_s_a_sprime[state][act] / sampled_frequency_s_a[state][act]))
                    #secondterm = mdp.discountFactor*sum(VupperMBAE[ss]*N_s_a_sprime[state][act][ss]/sampled_frequency_s_a[state][act] for ss in range(mdp.numStates))
                    lower_secondterm = mdp.discountFactor * np.sum(
                        VlowerMBAE * (N_s_a_sprime[state][act] / sampled_frequency_s_a[state][act]))
                    star_secondterm = mdp.discountFactor * np.sum(
                        Vstar * (N_s_a_sprime[state][act] / sampled_frequency_s_a[state][act]))
                    #lower_secondterm = mdp.discountFactor*sum(VlowerMBAE[ss]*N_s_a_sprime[state][act][ss]/sampled_frequency_s_a[state][act] for ss in range(mdp.numStates))
                    thirdterm = mdp.Vmax * math.sqrt(
                        (math.log(c * (samples**2) * mdp.numStates * mdp.numActions) -
                         math.log(delta)) / sampled_frequency_s_a[state][act])
                    #QupperMBAE[state][act] = (float)(sum(rewards_s_a_sprime[state][act][ss] for ss in range(mdp.numStates))/sampled_frequency_s_a[state][act]) + secondterm + thirdterm
                    QupperMBAE[state][act] = firstterm + secondterm + thirdterm
                    QlowerMBAE[state][act] = firstterm + lower_secondterm - thirdterm
                    QstarMBAE[state][act] = firstterm + star_secondterm
                    # Calculation for Vstar
                    # t = (float)N_s_a_sprime[state][act][stateprime]/sampled_frequency_s_a[state][act]
                    # val = t*(rewards_s_a[state][act][stateprime]+mdp.discountFactor*Vstar[stateprime])
                VupperMBAE[state] = np.amax(QupperMBAE[state])
                VlowerMBAE[state] = np.amax(QlowerMBAE[state])
                Vstar[state] = np.amax(QstarMBAE[state])
            if (np.linalg.norm(oldQlowerMBAE - QlowerMBAE[start_state]) <= epsilon_convergence):
                break

        for i in range(mdp.numStates):
            for j in range(mdp.numActions):
                if (sampled_frequency_s_a[i][j] > 0):
                    P_tilda[i][j] = UpperP(i, j, delta, N_s_a_sprime[i][j], mdp.numStates, Vupper, False)
                    P_lower_tilda[i][j] = LowerP(i, j, delta, N_s_a_sprime[i][j], mdp.numStates, Vlower, False)
        Qupper, Vupper = iteratedConvergence(Qupper, R_s_a, P_tilda, mdp.discountFactor, epsilon,
                                             converge_iterations, epsilon_convergence)
        Qlower, Vlower = iteratedConvergence(Qlower, R_s_a, P_lower_tilda, mdp.discountFactor, epsilon,
                                             converge_iterations, epsilon_convergence)
        # Qstar, _ = iteratedConvergence(Qstar,R_s_a,P_,mdp.discountFactor, epsilon, converge_iterations, epsilon_convergence)
        iteration += 1
        acList = bestTwoActions(mdp, start_state, QlowerMBAE, QupperMBAE, QstarMBAE)
        coll = QupperMBAE[start_state][acList[1]] - QlowerMBAE[start_state][acList[0]] \
            - epsilon * (1 - mdp.discountFactor) / 2

    # sys.stdout = orig_stdout
    # f.close()
    print(iteration)
    a = open('final' + mdp.filename + '-fiechter.txt', 'a+')
    a.write(str(iteration) + '\n')
    a.close()
    return getBestPolicy(mdp, rewards_s_a_sprime, P_h_s_a_s_prime[0])
    # return policy_h_s[0]
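# Illustrative sketch (not used above): FeichterPolicy's episode-horizon choice,
# H = (ln Vmax + ln(6/epsilon)) / (1 - gamma). Since gamma <= exp(-(1 - gamma)),
# this guarantees gamma**H * Vmax <= epsilon/6, i.e. the truncated tail of an
# episode contributes at most epsilon/6 to the discounted return. The helper name
# and the example numbers are mine; it relies on the module-level `math` import.
def fiechter_horizon(v_max, epsilon, gamma):
    return int((math.log(v_max) + math.log(6.0 / epsilon)) / (1 - gamma))

# Example: fiechter_horizon(10, 1.0, 0.9) == 40, and 0.9**40 * 10 ~= 0.15 <= 1/6.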
def mbie(mdp, start_state=0, epsilon=4, randomseed=None, delta=0.1):
    global c
    if(randomseed is not None):
        np.random.seed(randomseed)
    initial_iterations = 1*mdp.numStates*mdp.numActions
    ### Estimate the horizon based on Fiechter
    H = int((math.log(mdp.Vmax) + math.log(6.0/epsilon))/(1-mdp.discountFactor))
    it = 0
    samples = 0
    ### Calculating m based on the parameters
    first_term = mdp.numStates/(epsilon**2*(1-mdp.discountFactor)**4)
    second_term = math.log(mdp.numStates*mdp.numActions/(epsilon*(1-mdp.discountFactor)*delta))/(epsilon**2*(1-mdp.discountFactor)**4)
    m = c*(first_term+second_term)
    print("Chosen values of H and m are :", H, m)
    N_s_a = np.zeros((mdp.numStates, mdp.numActions))
    N_s_a_sprime = np.zeros((mdp.numStates, mdp.numActions, mdp.numStates))
    P_s_a_sprime = np.zeros((mdp.numStates, mdp.numActions, mdp.numStates))
    P_tilda = np.zeros((mdp.numStates, mdp.numActions, mdp.numStates))
    P_lower_tilda = np.zeros((mdp.numStates, mdp.numActions, mdp.numStates))
    R_s_a = np.zeros((mdp.numStates, mdp.numActions))
    Qupper = mdp.Vmax*np.random.random([mdp.numStates, mdp.numActions])
    QupperMBAE = mdp.Vmax*np.ones((mdp.numStates, mdp.numActions))
    Qlower = np.zeros((mdp.numStates, mdp.numActions))
    QlowerMBAE = np.zeros((mdp.numStates, mdp.numActions))
    Qstar = (mdp.Vmax/2)*np.ones((mdp.numStates, mdp.numActions))
    Vupper = mdp.Vmax*np.random.random([mdp.numStates])
    VupperMBAE = mdp.Vmax*np.ones((mdp.numStates))
    Vlower = np.zeros((mdp.numStates))
    VlowerMBAE = np.zeros((mdp.numStates))
    Vstar = (mdp.Vmax/2)*np.ones((mdp.numStates))
    best_policy = (-1)*np.ones((mdp.numStates), dtype=int)

    ### Initial sampling for all state action pairs
    while it < initial_iterations:
        for state in range(mdp.numStates):
            for act in range(mdp.numActions):
                it += 1
                ss, rr = mdp.simulate(state, act)
                R_s_a[state][act] = rr
                N_s_a[state][act] += 1
                N_s_a_sprime[state][act][ss] += 1
                # P_s_a_sprime = np.copy(N_s_a_sprime)
                for s2 in range(mdp.numStates):
                    P_s_a_sprime[state][act][s2] = (float)(N_s_a_sprime[state][act][s2])/N_s_a[state][act]
    samples += initial_iterations
    print(P_s_a_sprime)
    print("Completed initial iterations")

    Qupper, Vupper = iteratedConvergence(Qupper, R_s_a, P_s_a_sprime, mdp.discountFactor, epsilon,
                                         converge_iterations, epsilon_convergence)
    print(Qupper, "Qupper")
    # print Qupper, Vupper
    current_state = start_state

    ### Repeat forever
    if(verbose == 0):
        outp = open(mdp.filename+'-mbie' + str(randomseed) + '.txt', 'w')
        # sys.stdout = open(mdp.filename+'-mbie.txt', 'w+')
    ff = open(mdp.filename+'-mbie-samples.txt', 'w+')
    while samples < MAX_ITERATION_LIMIT:
        current_state = start_state
        h = 1
        # print Qupper[start_state], Qstar[start_state], Qlower[start_state]
        while h <= H:
            if(samples % 100 == 0):
                acList = bestTwoActions(mdp, start_state, QlowerMBAE, QupperMBAE, Qstar)
                if(verbose == 0):
                    outp.write(str(samples))
                    outp.write('\t')
                    outp.write(str(QupperMBAE[start_state][acList[1]]-QlowerMBAE[start_state][acList[0]]))  #-epsilon*(1-mdp.discountFactor)/2
                    outp.write('\n')
                    print(samples, (QupperMBAE[start_state][acList[1]]-QlowerMBAE[start_state][acList[0]]))
                else:
                    print(samples, (QupperMBAE[start_state][acList[1]], QlowerMBAE[start_state][acList[0]]))
                    pass
                np.savetxt(ff, N_s_a, delimiter=',')
                ff.write('\n')
            for i in range(mdp.numStates):
                # print "For state ", i, " doing UpperP"
                for j in range(mdp.numActions):
                    P_tilda[i][j] = UpperP(i, j, delta, N_s_a_sprime[i][j], mdp.numStates, Vupper, False)
                    P_lower_tilda[i][j] = LowerP(i, j, delta, N_s_a_sprime[i][j], mdp.numStates, Vlower, False)
            # print "Starting iterating"
            # print Qupper
            # return 2
            Qupper, Vupper = iteratedConvergence(Qupper, R_s_a, P_tilda, mdp.discountFactor, epsilon,
                                                 converge_iterations, epsilon_convergence)
            Qlower, Vlower = iteratedConvergence(Qlower, R_s_a, P_lower_tilda, mdp.discountFactor, epsilon,
                                                 converge_iterations, epsilon_convergence)
            current_action = np.argmax(QupperMBAE[current_state])
            # print Qupper[start_state], Qlower[start_state]
            best_policy[current_state] = current_action
            if(N_s_a[current_state][current_action] < m):
                for t in range(1):
                    ss, rr = mdp.simulate(current_state, current_action)
                    R_s_a[current_state][current_action] = (rr + R_s_a[current_state][current_action]*N_s_a[current_state][current_action])/(N_s_a[current_state][current_action]+1)
                    N_s_a[current_state][current_action] += 1
                    N_s_a_sprime[current_state][current_action][ss] += 1
                    samples += 1
                    # Renormalise the transition row that was actually updated (the
                    # original indexed this with the stale variables `state`/`act`).
                    for s2 in range(mdp.numStates):
                        P_s_a_sprime[current_state][current_action][s2] = (float)(N_s_a_sprime[current_state][current_action][s2])/N_s_a[current_state][current_action]
                    current_state = ss
            else:
                print("TRUEEEE")
                print(N_s_a[current_state])
                # print P_s_a_sprime[current_state][current_action]
                # print np.sum(P_s_a_sprime[current_state][current_action])
                # print N_s_a[current_state][current_action]
                current_state = np.random.choice(
                    np.arange(mdp.numStates),
                    p=P_s_a_sprime[current_state][current_action]/np.sum(P_s_a_sprime[current_state][current_action]))
            h += 1

            # Compute MBAE Qupper and Qlower bounds
            for internal in range(converge_iterations):
                oldQlower = np.copy(QlowerMBAE[start_state])
                for state in range(mdp.numStates):
                    for act in range(mdp.numActions):
                        # Calculations for QupperMBAE and QlowerMBAE
                        firstterm = R_s_a[state][act]
                        secondterm = mdp.discountFactor*np.sum(VupperMBAE*(N_s_a_sprime[state][act]/N_s_a[state][act]))
                        #secondterm = mdp.discountFactor*sum(Vupper[ss]*N_s_a_sprime[state][act][ss]/N_s_a[state][act] for ss in range(mdp.numStates))
                        lower_secondterm = mdp.discountFactor*np.sum(VlowerMBAE*(N_s_a_sprime[state][act]/N_s_a[state][act]))
                        star_secondterm = mdp.discountFactor*np.sum(Vstar*(N_s_a_sprime[state][act]/N_s_a[state][act]))
                        #lower_secondterm = mdp.discountFactor*sum(Vlower[ss]*N_s_a_sprime[state][act][ss]/N_s_a[state][act] for ss in range(mdp.numStates))
                        thirdterm = mdp.Vmax*math.sqrt((math.log(c*(samples**2)*mdp.numStates*mdp.numActions)-math.log(delta))/N_s_a[state][act])
                        #Qupper[state][act] = (float)(sum(rewards_s_a_sprime[state][act][ss] for ss in range(mdp.numStates))/N_s_a[state][act]) + secondterm + thirdterm
                        QupperMBAE[state][act] = firstterm + secondterm + thirdterm
                        # print firstterm, secondterm, thirdterm
                        QlowerMBAE[state][act] = firstterm + lower_secondterm - thirdterm
                        Qstar[state][act] = firstterm + star_secondterm
                        # Calculation for Vstar
                        # t = (float)N_s_a_sprime[state][act][stateprime]/N_s_a[state][act]
                        # val = t*(rewards_s_a[state][act][stateprime]+mdp.discountFactor*Vstar[stateprime])
                    VupperMBAE[state] = np.amax(QupperMBAE[state])
                    VlowerMBAE[state] = np.amax(QlowerMBAE[state])
                    Vstar[state] = np.amax(Qstar[state])
                if(np.linalg.norm(oldQlower-QlowerMBAE[start_state]) <= epsilon_convergence):
                    # print "Stopping with ", internal, "iterations"
                    break
    return best_policy
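# Illustrative sketch (not used above): the per-(s, a) sample budget m computed at
# the top of mbie(), factored out for clarity. The helper name is mine, and the
# constant c is passed explicitly because the value it should match is the
# module-level constant defined elsewhere in this file.
def mbie_sample_budget(num_states, num_actions, epsilon, gamma, delta, c=1.0):
    denom = (epsilon**2) * (1 - gamma)**4
    first_term = num_states / denom
    second_term = math.log(num_states * num_actions /
                           (epsilon * (1 - gamma) * delta)) / denom
    return c * (first_term + second_term)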
def LUCBEpisodic(mdp, start_state=0, epsilon=4, randomseed=None, delta=0.1, fileprint=1):
    if (randomseed is not None):
        np.random.seed(randomseed)
    global MAX_ITERATION_LIMIT, c
    iteration = 0
    it = 0
    H = int((math.log(mdp.Vmax) + math.log(6.0 / epsilon)) / (1 - mdp.discountFactor))
    initial_iterations = 1 * mdp.numStates * mdp.numActions
    rewards_s_a_sprime = np.zeros((mdp.numStates, mdp.numActions, mdp.numStates))
    R_s_a = np.zeros((mdp.numStates, mdp.numActions))
    sampled_frequency_s_a = np.zeros((mdp.numStates, mdp.numActions))
    N_s_a_sprime = np.zeros((mdp.numStates, mdp.numActions, mdp.numStates))
    P = np.zeros((mdp.numStates, mdp.numActions, mdp.numStates))
    P_tilda = np.zeros((mdp.numStates, mdp.numActions, mdp.numStates))
    P_lower_tilda = np.zeros((mdp.numStates, mdp.numActions, mdp.numStates))
    VlowerMBAE = np.zeros((mdp.numStates))
    Vlower = np.zeros((mdp.numStates))
    Vstar = (mdp.Vmax / 2) * np.ones((mdp.numStates))
    Vupper = mdp.Vmax * np.random.random([mdp.numStates])
    Qlower = np.zeros((mdp.numStates, mdp.numActions))
    VupperMBAE = mdp.Vmax * np.ones((mdp.numStates))
    QlowerMBAE = np.zeros((mdp.numStates, mdp.numActions))
    Qstar = (mdp.Vmax / 2) * np.ones((mdp.numStates, mdp.numActions))
    QupperMBAE = mdp.Vmax * np.ones((mdp.numStates, mdp.numActions))
    Qupper = mdp.Vmax * np.random.random([mdp.numStates, mdp.numActions])
    final_policy = (-1) * np.ones((mdp.numStates), dtype=int)
    states_to_sample = list(range(mdp.numStates))
    colliding_values = np.zeros((mdp.numStates))
    is_converged = 0
    print("Vmax", mdp.Vmax)
    print("Epsilon is ", epsilon)

    ### Initial sampling for all state action pairs
    while it < initial_iterations:
        for state in range(mdp.numStates):
            for act in range(mdp.numActions):
                it += 1
                s_prime, r = mdp.simulate(state, act)
                rewards_s_a_sprime[state][act][s_prime] += r
                R_s_a[state][act] = (r + R_s_a[state][act] * sampled_frequency_s_a[state][act]) / (
                    sampled_frequency_s_a[state][act] + 1)
                sampled_frequency_s_a[state][act] += 1
                N_s_a_sprime[state][act][s_prime] += 1
                for s2 in range(mdp.numStates):
                    P[state][act][s2] = (float)(N_s_a_sprime[state][act][s2]) / sampled_frequency_s_a[state][act]

    ### Calculating V, Q estimates (MBAE) from the initial samples
    for internal in range(converge_iterations):
        oldQlowerMBAE = np.copy(QlowerMBAE[start_state])
        for state in range(mdp.numStates):
            for act in range(mdp.numActions):
                # Calculations for QupperMBAE and QlowerMBAE
                firstterm = np.sum(rewards_s_a_sprime[state][act]) / sampled_frequency_s_a[state][act]
                secondterm = mdp.discountFactor * np.sum(
                    VupperMBAE * (N_s_a_sprime[state][act] / sampled_frequency_s_a[state][act]))
                lower_secondterm = mdp.discountFactor * np.sum(
                    VlowerMBAE * (N_s_a_sprime[state][act] / sampled_frequency_s_a[state][act]))
                star_secondterm = mdp.discountFactor * np.sum(
                    Vstar * (N_s_a_sprime[state][act] / sampled_frequency_s_a[state][act]))
                thirdterm = mdp.Vmax * math.sqrt(
                    (math.log(c * mdp.numStates * mdp.numActions) - math.log(delta)) /
                    sampled_frequency_s_a[state][act])
                QupperMBAE[state][act] = firstterm + secondterm + thirdterm
                QlowerMBAE[state][act] = firstterm + lower_secondterm - thirdterm
                Qstar[state][act] = firstterm + star_secondterm
            VupperMBAE[state] = np.amax(QupperMBAE[state])
            VlowerMBAE[state] = np.amax(QlowerMBAE[state])
            Vstar[state] = np.amax(Qstar[state])
        if (np.linalg.norm(oldQlowerMBAE - QlowerMBAE[start_state]) <= epsilon_convergence):
            print("Stopping with ", internal, "initial internal iterations")
            break
    if internal == converge_iterations - 1:
        # (The original compared against converge_iterations, which can never be reached.)
        print("Used all iterations")
    print("Initial estimate of QupperMBAE found! Now sampling")

    Qupper = np.copy(QupperMBAE)
    Qlower = np.copy(QlowerMBAE)
    if (verbose == 0):
        outp = open(mdp.filename + '-lucbeps' + str(randomseed) + '.txt', 'w')
    ff = open(mdp.filename + '-lucbeps-samples.txt', 'w+')
    h = 0
    state1 = start_state
    iteration += initial_iterations
    while iteration < MAX_ITERATION_LIMIT:
        max_collision_state = [
            sorted(states_to_sample, key=lambda x: colliding_values[x], reverse=True)[0]
        ]
        if (h % H == 0):
            state1 = start_state
            h = 0
        else:
            state1 = nextstate
        actionsList = bestTwoActions(mdp, state1, QlowerMBAE, QupperMBAE, Qstar)
        a = np.random.choice(actionsList)
        iteration += 1
        for t in range(1):
            s_prime, r = mdp.simulate(state1, a)
            nextstate = s_prime
            rewards_s_a_sprime[state1][a][s_prime] += r
            # Update the pair that was actually sampled (the original indexed these
            # updates with the stale loop variable `act` instead of `a`).
            R_s_a[state1][a] = (r + R_s_a[state1][a] * sampled_frequency_s_a[state1][a]) / (
                sampled_frequency_s_a[state1][a] + 1)
            sampled_frequency_s_a[state1][a] += 1
            N_s_a_sprime[state1][a][s_prime] += 1
            if (verbose == 1):
                pass
                # print "s, a, sprime"
                # print state1, a, s_prime
            for s2 in range(mdp.numStates):
                P[state1][a][s2] = (float)(N_s_a_sprime[state1][a][s2]) / sampled_frequency_s_a[state1][a]

        ## Calculating Q and V values
        for i in range(mdp.numStates):
            for j in range(mdp.numActions):
                if (sampled_frequency_s_a[i][j] > 0):
                    P_tilda[i][j] = UpperP(i, j, delta, N_s_a_sprime[i][j], mdp.numStates, Vupper, False)
                    P_lower_tilda[i][j] = LowerP(i, j, delta, N_s_a_sprime[i][j], mdp.numStates, Vlower, False)
        if (verbose == 1):
            pass
        Qupper, Vupper = iteratedConvergence(Qupper, R_s_a, P_tilda, mdp.discountFactor, epsilon,
                                             converge_iterations, epsilon_convergence)
        Qlower, Vlower = iteratedConvergence(Qlower, R_s_a, P_lower_tilda, mdp.discountFactor, epsilon,
                                             converge_iterations, epsilon_convergence)
        if (verbose == 1):
            # print "Calculated Q values are :"
            print(QupperMBAE[start_state], Qstar[start_state], QlowerMBAE[start_state])

        # Calculations for QupperMBAE and QlowerMBAE
        #### This involves a double for-loop and iterated convergence
        for internal in range(converge_iterations):
            oldQlowerMBAE = np.copy(QlowerMBAE[start_state])
            for state in range(mdp.numStates):
                for act in range(mdp.numActions):
                    # Calculations for QupperMBAE and QlowerMBAE
                    firstterm = np.sum(rewards_s_a_sprime[state][act]) / sampled_frequency_s_a[state][act]
                    secondterm = mdp.discountFactor * np.sum(
                        VupperMBAE * (N_s_a_sprime[state][act] / sampled_frequency_s_a[state][act]))
                    lower_secondterm = mdp.discountFactor * np.sum(
                        VlowerMBAE * (N_s_a_sprime[state][act] / sampled_frequency_s_a[state][act]))
                    star_secondterm = mdp.discountFactor * np.sum(
                        Vstar * (N_s_a_sprime[state][act] / sampled_frequency_s_a[state][act]))
                    thirdterm = mdp.Vmax * math.sqrt(
                        (math.log(c * (iteration**2) * mdp.numStates * mdp.numActions) -
                         math.log(delta)) / sampled_frequency_s_a[state][act])
                    QupperMBAE[state][act] = firstterm + secondterm + thirdterm
                    QlowerMBAE[state][act] = firstterm + lower_secondterm - thirdterm
                    Qstar[state][act] = firstterm + star_secondterm
                VupperMBAE[state] = np.amax(QupperMBAE[state])
                VlowerMBAE[state] = np.amax(QlowerMBAE[state])
                Vstar[state] = np.amax(Qstar[state])
            if (np.linalg.norm(oldQlowerMBAE - QlowerMBAE[start_state]) <= epsilon_convergence):
                break

        count = 0
        if (iteration % 100 == 0):
            acList = bestTwoActions(mdp, start_state, Qlower, Qupper, Qstar)
            # print "Qupper, Qstar, Qlower"
            # print Qupper[start_state], Qstar[start_state], Qlower[start_state]
            if (verbose == 0):
                outp.write(str(iteration))
                outp.write('\t')
                outp.write(str(QupperMBAE[start_state][acList[1]] -
                               QlowerMBAE[start_state][acList[0]]))  #-epsilon*(1-mdp.discountFactor)/2
                outp.write('\n')
            else:
                print(iteration, QupperMBAE[start_state][acList[1]] - QlowerMBAE[start_state][acList[0]])
            np.savetxt(ff, sampled_frequency_s_a, delimiter=',')
            ff.write('\n')

        ##### Updating the list of colliding states
        if (iteration > 50):
            states_to_sample = []
            for st in range(mdp.numStates):
                acList = bestTwoActions(mdp, st, QlowerMBAE, QupperMBAE, Qstar)
                ##### Changing stopping condition to epsilon*(1-gamma)/2
                colliding_values[st] = QupperMBAE[st][acList[1]] - QlowerMBAE[st][acList[0]] \
                    - epsilon * (1 - mdp.discountFactor) / 2
                if (colliding_values[st] > 0):
                    ### this state is still colliding, add to sample states
                    states_to_sample.append(st)
        else:
            # for st in range(mdp.numStates):
            #     acList = bestTwoActions(mdp, st, Qlower, Qupper, Qstar)
            #     colliding_values[st] = Qupper[st][acList[1]]-Qlower[st][acList[0]]-epsilon*(1-mdp.discountFactor)/2
            colliding_values = np.arange(mdp.numStates, dtype=float)
            states_to_sample = list(range(mdp.numStates))

        #### Check epsilon condition for only the starting state
        if (not (start_state in states_to_sample) and iteration > 50):
            # if(count==mdp.numStates):
            acList = bestTwoActions(mdp, start_state, QlowerMBAE, QupperMBAE, Qstar)
            print("Difference is ", Qupper[start_state][acList[1]] - Qlower[start_state][acList[0]])
            print("Setting final_policy of ", start_state, " to", acList[0])
            final_policy[start_state] = acList[0]
            print("Iterations taken : ", iteration)
            for i in range(mdp.numStates):
                if (final_policy[i] == -1):
                    final_policy[i] = bestTwoActions(mdp, i, QlowerMBAE, QupperMBAE, Qstar)[0]
            print("Returning policy : ", final_policy)
            if (iteration != 51):
                a = open('final' + mdp.filename + '-lucbeps.txt', 'a+')
                a.write(str(iteration) + '\n')
                a.close()
            return final_policy
        h += 1

    if (verbose == 0):
        outp.close()
    ff.close()
    for i in range(mdp.numStates):
        if (final_policy[i] == -1):
            final_policy[i] = bestTwoActions(mdp, i, QlowerMBAE, QupperMBAE, Qstar)[0]
    return final_policy
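# Illustrative sketch (not used above): the confidence bonus that LUCBEpisodic adds
# to (and subtracts from) the empirical Q-value when forming QupperMBAE/QlowerMBAE,
# Vmax * sqrt((log(c * t^2 * S * A) - log(delta)) / n). This mirrors the in-loop
# version that uses iteration**2; the initial-sampling pass above omits the t**2
# factor. The helper and parameter names are mine.
def mbae_bonus(v_max, c, t, num_states, num_actions, delta, n_visits):
    if n_visits <= 0:
        return v_max
    return v_max * math.sqrt(
        (math.log(c * (t**2) * num_states * num_actions) - math.log(delta)) / n_visits)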
def ddvouu(mdp, start_state=0, epsilon=4, randomseed=None, delta=0.1): if (randomseed is not None): np.random.seed(randomseed) initial_iterations = 1 * mdp.numStates * mdp.numActions ### Estimate the horizon based on Fiechter c = 1 it = 0 samples = 0 first_term = mdp.numStates / (epsilon**2 * (1 - mdp.discountFactor)**4) second_term = math.log( mdp.numStates * mdp.numActions / (epsilon * (1 - mdp.discountFactor) * delta)) / (epsilon**2 * (1 - mdp.discountFactor)**4) m = c * (first_term + second_term) delta = delta / (mdp.numStates * mdp.numActions * m) print("Chosen value of m is :", m) N_s_a = np.zeros((mdp.numStates, mdp.numActions), dtype=np.int) N_s_a_sprime = np.zeros((mdp.numStates, mdp.numActions, mdp.numStates), dtype=np.int) P_s_a_sprime = np.zeros((mdp.numStates, mdp.numActions, mdp.numStates)) P_tilda = np.zeros((mdp.numStates, mdp.numActions, mdp.numStates)) P_lower_tilda = np.zeros((mdp.numStates, mdp.numActions, mdp.numStates)) R_s_a = np.zeros((mdp.numStates, mdp.numActions)) Qupper = mdp.Vmax * np.ones((mdp.numStates, mdp.numActions)) QupperMBAE = mdp.Vmax * np.ones((mdp.numStates, mdp.numActions)) Qlower = np.zeros((mdp.numStates, mdp.numActions)) Qstar = (mdp.Vmax / 2) * np.ones((mdp.numStates, mdp.numActions)) QlowerMBAE = np.zeros((mdp.numStates, mdp.numActions)) Vupper = mdp.Vmax * np.ones((mdp.numStates)) VupperMBAE = mdp.Vmax * np.ones((mdp.numStates)) Vlower = np.zeros((mdp.numStates)) VlowerMBAE = np.zeros((mdp.numStates)) Vstar = (mdp.Vmax / 2) * np.ones((mdp.numStates)) best_policy = (-1) * np.ones((mdp.numStates), dtype=np.int) deltadeltaV = np.zeros((mdp.numStates, mdp.numActions)) discovered_states = set([start_state, 1, 2, 3, 4]) ## Initial sampling for all state action pairs ### Is this needed? while it < initial_iterations: for state in range(mdp.numStates): for act in range(mdp.numActions): it += 1 ss, rr = mdp.simulate(state, act) print("Sampling ", state, act, rr, ss) R_s_a[state][act] = (rr + R_s_a[state][act] * N_s_a[state][act] ) / (N_s_a[state][act] + 1) N_s_a[state][act] += 1 N_s_a_sprime[state][act][ss] += 1 # P_s_a_sprime = np.copy(N_s_a_sprime) for s2 in range(mdp.numStates): P_s_a_sprime[state][act][s2] = (float)( N_s_a_sprime[state][act][s2]) / N_s_a[state][act] samples += initial_iterations print(P_s_a_sprime) print("Completed initial iterations") if (verbose == 0): outp = open(mdp.filename + '-ddv' + str(randomseed) + '.txt', 'wb') # sys.stdout = open(mdp.filename+'-ddv.txt', 'w+') ff = open(mdp.filename + '-ddv-samples.txt', 'w+') # print Qupper, Vupper current_state = start_state ### Repeat forever while samples < MAX_ITERATION_LIMIT: # print Qupper[start_state], Qlower[start_state] for i in range(mdp.numStates): for j in range(mdp.numActions): if (N_s_a[i][j] > 0): P_tilda[i][j] = UpperP(i, j, delta, N_s_a_sprime[i][j], mdp.numStates, Vupper, False) P_lower_tilda[i][j] = LowerP(i, j, delta, N_s_a_sprime[i][j], mdp.numStates, Vlower, False) ##Calculate Q values Qupper, Vupper = iteratedConvergence(Qupper, R_s_a, P_tilda, mdp.discountFactor, epsilon, converge_iterations, epsilon_convergence) Qlower, Vlower = iteratedConvergence(Qlower, R_s_a, P_lower_tilda, mdp.discountFactor, epsilon, converge_iterations, epsilon_convergence) current_state = start_state ### Terminating condition if (use_mbae): acList = bestTwoActions(mdp, start_state, QlowerMBAE, QupperMBAE, Qstar) coll = QupperMBAE[start_state][ acList[1]] - QlowerMBAE[start_state][ acList[0]] - epsilon * (1 - mdp.discountFactor) / 2 else: acList = bestTwoActions(mdp, start_state, Qlower, 
Qupper, Qstar) coll = Qupper[start_state][acList[1]] - Qlower[start_state][ acList[0]] - epsilon * (1 - mdp.discountFactor) / 2 # if(Vupper[start_state]-Vlower[start_state]<=epsilon and samples>50): if (coll < 0 and samples > 50): a = open('final' + mdp.filename + '-ddv.txt', 'a+') a.write(str(samples) + '\n') a.close() print(Qupper[start_state], Vupper[start_state], Vlower[start_state]) policy_lower = np.argmax(Qlower, axis=1) print("Iteration number ", samples) print("Returning policy because of epsilon-convergence") print(policy_lower) print(np.argmax(QupperMBAE, axis=1)) print(np.argmax(Qupper, axis=1)) print(np.argmax(QlowerMBAE, axis=1)) print(np.argmax(Qstar, axis=1)) return policy_lower ## Caclulate deldelV for all states if (use_mbae): for st in list(discovered_states): for ac in range(mdp.numActions): #### Compute del del V deltadeltaV[st][ac] = CalculateDelDelV( st, ac, mdp, N_s_a_sprime, QupperMBAE, QlowerMBAE, VupperMBAE, VlowerMBAE, start_state, P_s_a_sprime, P_tilda, P_lower_tilda, R_s_a, epsilon, delta, converge_iterations, epsilon_convergence) else: for st in list(discovered_states): for ac in range(mdp.numActions): #### Compute del del V deltadeltaV[st][ac] = CalculateDelDelV( st, ac, mdp, N_s_a_sprime, Qupper, Qlower, Vupper, Vlower, start_state, P_s_a_sprime, P_tilda, P_lower_tilda, R_s_a, epsilon, delta, converge_iterations, epsilon_convergence) #### Simulate greedily wrt deldelV # print np.unravel_index(deltadeltaV.argmax(), deltadeltaV.shape) current_state, current_action = np.unravel_index( deltadeltaV.argmax(), deltadeltaV.shape) #time.sleep(0.1) print(deltadeltaV) ss, rr = mdp.simulate(current_state, current_action) samples += 1 print("Sampling ", current_state, current_action, rr, ss) #### Add received state to the set of discovered states #discovered_states.add(ss) print(discovered_states) ### Update believed model R_s_a[current_state][current_action] = ( rr + R_s_a[current_state][current_action] * N_s_a[current_state][current_action]) / ( N_s_a[current_state][current_action] + 1) N_s_a[current_state][current_action] += 1 N_s_a_sprime[current_state][current_action][ss] += 1 for s2 in range(mdp.numStates): # print current_state, current_action, s2, N_s_a_sprime[current_state][current_action][s2], N_s_a[current_state][current_action] P_s_a_sprime[current_state][current_action][s2] = (float)( N_s_a_sprime[current_state][current_action] [s2]) / N_s_a[current_state][current_action] if (samples % 100 == 0): if (use_mbae): acList = bestTwoActions(mdp, start_state, QlowerMBAE, QupperMBAE, Qstar) else: acList = bestTwoActions(mdp, start_state, Qlower, Qupper, Qstar) if (verbose == 0): outp.write(str(samples)) outp.write('\t') if (plot_vstar): outp.write(str(Vstar[start_state])) else: if (use_mbae): outp.write( str(QupperMBAE[start_state][acList[1]] - QlowerMBAE[start_state][acList[0]])) else: outp.write( str(Qupper[start_state][acList[1]] - Qlower[start_state][acList[0]])) outp.write('\n') if (use_mbae): print(samples, (QupperMBAE[start_state][acList[1]] - QlowerMBAE[start_state][acList[0]])) else: print(samples, (Qupper[start_state][acList[1]] - Qlower[start_state][acList[0]])) else: print(samples, (QupperMBAE[start_state][acList[1]] - QlowerMBAE[start_state][acList[0]])) np.savetxt(ff, N_s_a, delimiter=',') ff.write('\n') ### Calculating MBAE bounds for internal in range(converge_iterations): oldQlower = np.copy(QlowerMBAE[start_state]) for state in range(mdp.numStates): for act in range(mdp.numActions): # Calculations for Qupper and Qlower firstterm = 
R_s_a[state][act] secondterm = mdp.discountFactor * np.sum( VupperMBAE * (N_s_a_sprime[state][act] / N_s_a[state][act])) #secondterm = mdp.discountFactor*sum(Vupper[ss]*N_s_a_sprime[state][act][ss]/N_s_a[state][act] for ss in range(mdp.numStates)) lower_secondterm = mdp.discountFactor * np.sum( VlowerMBAE * (N_s_a_sprime[state][act] / N_s_a[state][act])) star_secondterm = mdp.discountFactor * np.sum( Vstar * (N_s_a_sprime[state][act] / N_s_a[state][act])) #lower_secondterm = mdp.discountFactor*sum(Vlower[ss]*N_s_a_sprime[state][act][ss]/N_s_a[state][act] for ss in range(mdp.numStates)) thirdterm = mdp.Vmax * math.sqrt( (math.log(c * (samples**2) * mdp.numStates * mdp.numActions) - math.log(delta)) / N_s_a[state][act]) #Qupper[state][act] = (float)(sum(rewards_s_a_sprime[state][act][ss] for ss in range(mdp.numStates))/N_s_a[state][act]) + secondterm + thirdterm QupperMBAE[state][act] = firstterm + secondterm + thirdterm QlowerMBAE[state][ act] = firstterm + lower_secondterm - thirdterm Qstar[state][act] = firstterm + star_secondterm # Calculation for Vstar # t = (float)N_s_a_sprime[state][act][stateprime]/N_s_a[state][act] # val = t*(rewards_s_a[state][act][stateprime]+mdp.discountFactor*Vstar[stateprime]) VupperMBAE[state] = np.amax(QupperMBAE[state]) VlowerMBAE[state] = np.amax(QlowerMBAE[state]) Vstar[state] = np.amax(Qstar[state]) if (np.linalg.norm(oldQlower - QlowerMBAE[start_state]) <= epsilon_convergence): # print "Stopping with ", internal, "iterations" break # if(samples==initial_iterations+2): # Qupper = np.copy(QupperMBAE) # Qlower = np.copy(QlowerMBAE) return best_policy
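# Illustrative sketch (not used above): the empirical-model update that every
# sampler in this file repeats after mdp.simulate(s, a) -> (s_prime, r): a running
# mean of the reward plus a renormalised transition row. The helper name is mine;
# the arguments are the numpy arrays used throughout (R_s_a, N_s_a, N_s_a_sprime,
# P_s_a_sprime), updated in place.
def update_empirical_model(R_s_a, N_s_a, N_s_a_sprime, P_s_a_sprime, s, a, s_prime, r):
    R_s_a[s][a] = (r + R_s_a[s][a] * N_s_a[s][a]) / (N_s_a[s][a] + 1)
    N_s_a[s][a] += 1
    N_s_a_sprime[s][a][s_prime] += 1
    # Vectorised form of the per-s2 renormalisation loop used above.
    P_s_a_sprime[s][a] = N_s_a_sprime[s][a] / float(N_s_a[s][a])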
def policyIt(mdp, start_state=0, epsilon=4, randomseed=None, delta=0.1, bounds="MBAE", use_ddv=False, mc = True): if(randomseed is not None): np.random.seed(randomseed) policies = np.array(getPolicies(mdp.numStates, mdp.numActions)) numPolicies = len(policies) counts = np.zeros((numPolicies)) print(numPolicies) #H = int((math.log(mdp.Vmax) + math.log(6.0/epsilon))/(1-mdp.discountFactor)) H = int(math.log(epsilon/(2*mdp.Vmax*(1 - mdp.discountFactor)))/math.log(mdp.discountFactor)) print("Chosen value of H is : ", H) ## Initializations it = 0 samples = 0 initial_iterations = 1*mdp.numStates*mdp.numActions R_s_a = np.zeros((mdp.numStates,mdp.numActions)) N_s_a_sprime = np.zeros((mdp.numStates,mdp.numActions,mdp.numStates), dtype=np.int) N_s_a = np.zeros((mdp.numStates,mdp.numActions), dtype=np.int) P_s_a_sprime = np.zeros((mdp.numStates,mdp.numActions,mdp.numStates)) Qupper = mdp.Vmax*np.ones((numPolicies, mdp.numStates)) QupperMBAE = mdp.Vmax*np.ones((numPolicies, mdp.numStates)) Qlower = np.zeros((numPolicies, mdp.numStates)) Qstar = (mdp.Vmax/2)*np.ones((numPolicies, mdp.numStates)) QstarMBAE = (mdp.Vmax/2)*np.ones((numPolicies, mdp.numStates)) QlowerMBAE = np.zeros((numPolicies, mdp.numStates)) P_tilda = np.zeros((numPolicies, mdp.numStates,mdp.numStates)) P_lower_tilda = np.zeros((numPolicies, mdp.numStates,mdp.numStates)) VlowerMBAE = np.zeros((numPolicies, mdp.numStates)) VupperMBAE = mdp.Vmax*np.ones((numPolicies, mdp.numStates)) Vstar = (mdp.Vmax/2)*np.ones((numPolicies, mdp.numStates)) discovered_states = set([start_state]) deltadeltaV = np.zeros((mdp.numStates)) #sampling all state action pairs while it < initial_iterations: for state in range(mdp.numStates): for act in range(mdp.numActions): it = it + 1 ss, rr = mdp.simulate(state, act) R_s_a[state][act] = (rr + R_s_a[state][act]*N_s_a[state][act])/(N_s_a[state][act]+1) N_s_a[state][act] = N_s_a[state][act] + 1 N_s_a_sprime[state][act][ss] = N_s_a_sprime[state][act][ss] + 1 # P_s_a_sprime = np.copy(N_s_a_sprime) for s2 in range(mdp.numStates): P_s_a_sprime[state][act][s2] = (float)(N_s_a_sprime[state][act][s2])/N_s_a[state][act] samples += initial_iterations if(use_ddv): ff = open(mdp.filename+'-policyddv' + str(randomseed) +'.txt', 'wb') else: ff = open(mdp.filename+'-policy' + str(randomseed) +'.txt', 'wb') while samples<MAX_ITERATION_LIMIT: # print counts if(policyMethod == 0): for p in range(numPolicies): # print "Policy Number : ", p current_policy = policies[p] for i in range(mdp.numStates): # print "For state ", i, " doing UpperP" if(N_s_a[i][current_policy[i]]>0): P_tilda[p][i] = UpperP( i, current_policy[i], delta, N_s_a_sprime[i][current_policy[i]], mdp.numStates, Qupper[p], False ) P_lower_tilda[p][i] = LowerP( i, current_policy[i], delta, N_s_a_sprime[i][current_policy[i]], mdp.numStates, Qlower[p], False ) #computing all three versions of Q given current knowlege of transition and reward matrices Qupper[p] = itConvergencePolicy( Qupper[p], getRewards(R_s_a, current_policy), P_tilda[p], mdp.discountFactor, epsilon, converge_iterations, epsilon_convergence ) Qlower[p] = itConvergencePolicy( Qlower[p], getRewards(R_s_a, current_policy), P_lower_tilda[p], mdp.discountFactor, epsilon, converge_iterations, epsilon_convergence ) Qstar[p] = itConvergencePolicy( Qstar[p], getRewards(R_s_a, current_policy), getProb(P_s_a_sprime, current_policy), mdp.discountFactor, epsilon, converge_iterations, epsilon_convergence ) # import pdb; pdb.set_trace() # print "mbie bounds calculated!" 
        for internal in range(converge_iterations):
            oldQlowerMBAE = np.copy(QlowerMBAE[p][start_state])
            for state in range(mdp.numStates):
                act = current_policy[state]
                # MBAE upper/lower/point estimates for the current policy
                firstterm = R_s_a[state][act]
                secondterm = mdp.discountFactor * np.sum(VupperMBAE[p] * P_s_a_sprime[state][act])
                lower_secondterm = mdp.discountFactor * np.sum(VlowerMBAE[p] * P_s_a_sprime[state][act])
                star_secondterm = mdp.discountFactor * np.sum(Vstar[p] * P_s_a_sprime[state][act])
                thirdterm = mdp.Vmax * math.sqrt((math.log(c * (samples**2) * mdp.numStates * 1) - math.log(delta)) / N_s_a[state][act])
                QupperMBAE[p][state] = firstterm + secondterm + thirdterm
                QlowerMBAE[p][state] = firstterm + lower_secondterm - thirdterm
                QstarMBAE[p][state] = firstterm + star_secondterm
                VupperMBAE[p][state] = QupperMBAE[p][state]
                VlowerMBAE[p][state] = QlowerMBAE[p][state]
                Vstar[p][state] = QstarMBAE[p][state]
            if np.linalg.norm(oldQlowerMBAE - QlowerMBAE[p][start_state]) <= epsilon_convergence:
                break

        # Best policy by point estimate, runner-up by upper confidence bound
        policy1Index = np.argmax(QstarMBAE[:, start_state])
        policy2choices = QupperMBAE[:, start_state].argsort()[::-1]
        if policy2choices[0] == policy1Index:
            policy2Index = policy2choices[1]
        else:
            policy2Index = policy2choices[0]

        # (alternative policy-selection variants — action-switching policy iteration and a
        #  hill-climbing search over one-step policy neighbours — are not used on this code path)

        h = 0
        policy1 = policies[policy1Index]
        policy2 = policies[policy2Index]
        state = start_state
        if (samples % 1000) < 100:
            if verbose == 0:
                ff.write(str(samples))
                ff.write('\t')
                if plot_vstar:
                    ff.write(str(evaluatePolicy(mdp, policy1, start_state)))
                    print(evaluatePolicy(mdp, policy1, start_state))
                    print(policy1, policy2)
                else:
                    ff.write(str(QupperMBAE[policy2Index][start_state] - QlowerMBAE[policy1Index][start_state]))  # -epsilon*(1-mdp.discountFactor)/2
                    print(samples, QupperMBAE[policy2Index][start_state] - QlowerMBAE[policy1Index][start_state])
                ff.write('\n')
            else:
                print(samples)
                print(QupperMBAE[:, start_state], QlowerMBAE[:, start_state])
        # np.savetxt(ff, policies[policy1Index], fmt="%d")
        counts[policy1Index] += 1
        counts[policy2Index] += 1
        polList = [policy1Index, policy2Index]

        if use_ddv:
            ## Calculate delta-delta-V for all discovered states and sample the maximiser
            for pnum in polList:
                policiesfddv = policies[pnum]
                for st in list(discovered_states):
                    ac = policiesfddv[st]
                    #### Compute del-del-V
                    deltadeltaV[st] = CalculateDelDelV(st, ac, mdp, N_s_a_sprime, QupperMBAE[pnum], QlowerMBAE[pnum], None, None, start_state, P_s_a_sprime, P_tilda[pnum], P_lower_tilda[pnum], R_s_a, epsilon, delta, converge_iterations, epsilon_convergence, policiesfddv)
                cs = np.argmax(deltadeltaV)
                ca = policiesfddv[cs]
                ss, rr = mdp.simulate(cs, ca)
                print("Policy is ", policiesfddv)
                print("Sampling ", cs, ca)
                time.sleep(0.1)
                samples = samples + 1
                discovered_states.add(ss)
                R_s_a[cs][ca] = (rr + R_s_a[cs][ca] * N_s_a[cs][ca]) / (N_s_a[cs][ca] + 1)
                N_s_a[cs][ca] += 1
                N_s_a_sprime[cs][ca][ss] += 1
                for s2 in range(mdp.numStates):
                    P_s_a_sprime[cs][ca][s2] = float(N_s_a_sprime[cs][ca][s2]) / N_s_a[cs][ca]
        elif mc:
            # Sample where (discounted occupancy) x (transition uncertainty) is largest under policy1
            deltaW = np.zeros(mdp.numStates)
            mu = np.zeros(mdp.numStates)
            D = np.zeros(mdp.numStates)
            mu[start_state] = 1
            for t in range(H):
                D = D + (mdp.discountFactor**t) * mu
                mu = prob_step(mu, P_s_a_sprime, policy1)
            for st in range(mdp.numStates):
                # transition uncertainty for given s, pi(s)
                deltaW[st] = delW(st, policy1[st], delta, N_s_a_sprime[st][policy1[st]], mdp.numStates, False)
            st = np.argmax(deltaW * D)
            ac = policy1[st]
            ss, rr = mdp.simulate(st, ac)
            samples += 1
            R_s_a[st][ac] = (rr + R_s_a[st][ac] * N_s_a[st][ac]) / (N_s_a[st][ac] + 1)
            N_s_a[st][ac] += 1
            N_s_a_sprime[st][ac][ss] += 1
            for s2 in range(mdp.numStates):
                P_s_a_sprime[st][ac][s2] = float(N_s_a_sprime[st][ac][s2]) / N_s_a[st][ac]

            # Repeat for policy2
            deltaW = np.zeros(mdp.numStates)
            mu = np.zeros(mdp.numStates)
            D = np.zeros(mdp.numStates)
            mu[start_state] = 1
            for t in range(H):
                D = D + (mdp.discountFactor**t) * mu
                mu = prob_step(mu, P_s_a_sprime, policy2)
            for st in range(mdp.numStates):
                # transition uncertainty for given s, pi(s)
                deltaW[st] = delW(st, policy2[st], delta, N_s_a_sprime[st][policy2[st]], mdp.numStates, False)
            st = np.argmax(deltaW * D)
            ac = policy2[st]
            ss, rr = mdp.simulate(st, ac)
            samples += 1
            R_s_a[st][ac] = (rr + R_s_a[st][ac] * N_s_a[st][ac]) / (N_s_a[st][ac] + 1)
            N_s_a[st][ac] += 1
            N_s_a_sprime[st][ac][ss] += 1
            for s2 in range(mdp.numStates):
                P_s_a_sprime[st][ac][s2] = float(N_s_a_sprime[st][ac][s2]) / N_s_a[st][ac]
        else:
            # Episodic rollouts of length H under policy1, then policy2
            while h < H:
                act = policy1[state]
                ss, rr = mdp.simulate(state, act)
                samples += 1
                R_s_a[state][act] = (rr + R_s_a[state][act] * N_s_a[state][act]) / (N_s_a[state][act] + 1)
                N_s_a[state][act] += 1
                N_s_a_sprime[state][act][ss] += 1
                for s2 in range(mdp.numStates):
                    P_s_a_sprime[state][act][s2] = float(N_s_a_sprime[state][act][s2]) / N_s_a[state][act]
                state = ss
                h += 1
            h = 0
            state = start_state
            while h < H:
                act = policy2[state]
                ss, rr = mdp.simulate(state, act)
                samples += 1
                R_s_a[state][act] = (rr + R_s_a[state][act] * N_s_a[state][act]) / (N_s_a[state][act] + 1)
                N_s_a[state][act] += 1
                N_s_a_sprime[state][act][ss] += 1
                for s2 in range(mdp.numStates):
                    P_s_a_sprime[state][act][s2] = float(N_s_a_sprime[state][act][s2]) / N_s_a[state][act]
                state = ss
                h += 1

        if (samples % 1000) < 1000:
            if (QupperMBAE[policy2Index][start_state] - QlowerMBAE[policy1Index][start_state]
                    - epsilon * (1 - mdp.discountFactor) / 2 < 0):
                print(Qupper[policy2Index][start_state], Qstar[policy1Index][start_state], epsilon * (1 - mdp.discountFactor) / 2)
                print("Epsilon condition reached at ", samples, " samples")
                print(policy1)
                return policy1
            else:
                pass
    ff.close()
    return policy1
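# The "thirdterm" used in the MBAE updates above is a Hoeffding-style confidence half-width:
# Vmax * sqrt((log(c * n^2 * #terms) - log(delta)) / N(s, a)), which shrinks as 1/sqrt(N(s, a))
# and grows only logarithmically with the running sample count. The helper below is an
# illustrative sketch of that quantity only (it is not called by the algorithms in this file);
# it assumes the module-level `math` import used throughout.
def _sketch_confidence_width(vmax, c_const, n_total, num_terms, delta, n_sa):
    # half-width of the MBAE interval for a state-action pair visited n_sa times
    return vmax * math.sqrt((math.log(c_const * (n_total**2) * num_terms) - math.log(delta)) / n_sa)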
def RoundRobin(mdp, start_state=0, epsilon=4, randomseed=None, delta=0.1):
    global MAX_ITERATION_LIMIT, c
    if randomseed is not None:
        np.random.seed(randomseed)
    iteration = 0
    it = 0
    initial_iterations = 1 * mdp.numStates * mdp.numActions
    rewards_s_a_sprime = np.zeros((mdp.numStates, mdp.numActions, mdp.numStates))
    R_s_a = np.zeros((mdp.numStates, mdp.numActions))
    sampled_frequency_s_a = np.zeros((mdp.numStates, mdp.numActions))
    N_s_a_sprime = np.zeros((mdp.numStates, mdp.numActions, mdp.numStates))
    P = np.zeros((mdp.numStates, mdp.numActions, mdp.numStates))
    P_tilda = np.zeros((mdp.numStates, mdp.numActions, mdp.numStates))
    P_lower_tilda = np.zeros((mdp.numStates, mdp.numActions, mdp.numStates))
    VlowerMBAE = np.zeros((mdp.numStates))
    Vlower = np.zeros((mdp.numStates))
    Vstar = (mdp.Vmax / 2) * np.ones((mdp.numStates))
    VupperMBAE = mdp.Vmax * np.ones((mdp.numStates))
    Vupper = mdp.Vmax * np.random.random([mdp.numStates])
    QlowerMBAE = np.zeros((mdp.numStates, mdp.numActions))
    Qlower = np.zeros((mdp.numStates, mdp.numActions))
    Qstar = (mdp.Vmax / 2) * np.ones((mdp.numStates, mdp.numActions))
    QupperMBAE = mdp.Vmax * np.ones((mdp.numStates, mdp.numActions))
    Qupper = mdp.Vmax * np.random.random([mdp.numStates, mdp.numActions])
    final_policy = (-1) * np.ones((mdp.numStates), dtype=np.int)
    states_to_sample = range(mdp.numStates)
    colliding_values = np.zeros((mdp.numStates))
    is_converged = 0

    ### Initial sampling for all state-action pairs
    while it < initial_iterations:
        for state in range(mdp.numStates):
            for act in range(mdp.numActions):
                it += 1
                s_prime, r = mdp.simulate(state, act)
                rewards_s_a_sprime[state][act][s_prime] += r
                R_s_a[state][act] = (r + R_s_a[state][act] * sampled_frequency_s_a[state][act]) / (sampled_frequency_s_a[state][act] + 1)
                sampled_frequency_s_a[state][act] += 1
                N_s_a_sprime[state][act][s_prime] += 1
                for s2 in range(mdp.numStates):
                    P[state][act][s2] = float(N_s_a_sprime[state][act][s2]) / sampled_frequency_s_a[state][act]

    ### Calculating V, Q estimates from the initial samples
    for state in range(mdp.numStates):
        for act in range(mdp.numActions):
            # MBAE bounds: empirical Bellman backup +/- confidence width
            firstterm = np.sum(rewards_s_a_sprime[state][act]) / sampled_frequency_s_a[state][act]
            secondterm = mdp.discountFactor * np.sum(VupperMBAE * (N_s_a_sprime[state][act] / sampled_frequency_s_a[state][act]))
            lower_secondterm = mdp.discountFactor * np.sum(VlowerMBAE * (N_s_a_sprime[state][act] / sampled_frequency_s_a[state][act]))
            thirdterm = mdp.Vmax * math.sqrt((math.log(c * mdp.numStates * mdp.numActions) - math.log(delta)) / sampled_frequency_s_a[state][act])
            QupperMBAE[state][act] = firstterm + secondterm + thirdterm
            QlowerMBAE[state][act] = firstterm + lower_secondterm - thirdterm
        VupperMBAE[state] = np.amax(QupperMBAE[state])
        VlowerMBAE[state] = np.amax(QlowerMBAE[state])
    Qupper = np.copy(QupperMBAE)
    Qlower = np.copy(QlowerMBAE)

    if verbose == 0:
        outp = open(mdp.filename + '-rr' + str(randomseed) + '.txt', 'w')
    ff = open(mdp.filename + '-rr-samples.txt', 'w+')

    while iteration < MAX_ITERATION_LIMIT:
        for state1 in range(mdp.numStates):
            for act1 in range(mdp.numActions):
                iteration += 1
                # Simulate the MDP with this (state, action) and update counts
                for t in range(1):
                    s_prime, r = mdp.simulate(state1, act1)
                    rewards_s_a_sprime[state1][act1][s_prime] += r
                    R_s_a[state1][act1] = (r + R_s_a[state1][act1] * sampled_frequency_s_a[state1][act1]) / (sampled_frequency_s_a[state1][act1] + 1)
                    sampled_frequency_s_a[state1][act1] += 1
                    N_s_a_sprime[state1][act1][s_prime] += 1
                    for s2 in range(mdp.numStates):
                        P[state1][act1][s2] = float(N_s_a_sprime[state1][act1][s2]) / sampled_frequency_s_a[state1][act1]

                ## Calculating Q and V values (MBIE bounds via optimistic/pessimistic transitions)
                for i in range(mdp.numStates):
                    for j in range(mdp.numActions):
                        if sampled_frequency_s_a[i][j] > 0:
                            P_tilda[i][j] = UpperP(i, j, delta, N_s_a_sprime[i][j], mdp.numStates, Vupper, False)
                            P_lower_tilda[i][j] = LowerP(i, j, delta, N_s_a_sprime[i][j], mdp.numStates, Vlower, False)
                Qupper, Vupper = iteratedConvergence(Qupper, R_s_a, P_tilda, mdp.discountFactor, epsilon, converge_iterations, epsilon_convergence)
                Qlower, Vlower = iteratedConvergence(Qlower, R_s_a, P_lower_tilda, mdp.discountFactor, epsilon, converge_iterations, epsilon_convergence)

                # MBAE bounds: empirical Bellman backup with a confidence width
                for state in range(mdp.numStates):
                    for act in range(mdp.numActions):
                        firstterm = np.sum(rewards_s_a_sprime[state][act]) / sampled_frequency_s_a[state][act]
                        secondterm = mdp.discountFactor * np.sum(VupperMBAE * (N_s_a_sprime[state][act] / sampled_frequency_s_a[state][act]))
                        lower_secondterm = mdp.discountFactor * np.sum(VlowerMBAE * (N_s_a_sprime[state][act] / sampled_frequency_s_a[state][act]))
                        star_secondterm = mdp.discountFactor * np.sum(Vstar * (N_s_a_sprime[state][act] / sampled_frequency_s_a[state][act]))
                        thirdterm = mdp.Vmax * math.sqrt((math.log(c * (iteration**2) * mdp.numStates * mdp.numActions) - math.log(delta)) / sampled_frequency_s_a[state][act])
                        QupperMBAE[state][act] = firstterm + secondterm + thirdterm
                        QlowerMBAE[state][act] = firstterm + lower_secondterm - thirdterm
                        Qstar[state][act] = firstterm + star_secondterm
                    VupperMBAE[state] = np.amax(QupperMBAE[state])
                    VlowerMBAE[state] = np.amax(QlowerMBAE[state])
                    Vstar[state] = np.amax(Qstar[state])

                count = 0
                if iteration % 100 == 0:
                    for i in range(mdp.numStates):
                        if final_policy[i] == -1:
                            final_policy[i] = bestTwoActions(mdp, i, QlowerMBAE, QupperMBAE, Qstar)[0]
                    acList = bestTwoActions(mdp, start_state, Qlower, Qupper, Qstar)
                    if verbose == 0:
                        outp.write(str(iteration))
                        outp.write('\t')
                        outp.write(str(QupperMBAE[start_state][acList[1]] - QlowerMBAE[start_state][acList[0]]))  # -epsilon*(1-mdp.discountFactor)/2
                        print(str(evaluatePolicy(mdp, final_policy, start_state)))
                        outp.write('\n')
                    else:
                        print(iteration, (QupperMBAE[start_state][acList[1]] - QlowerMBAE[start_state][acList[0]]))
                    np.savetxt(ff, sampled_frequency_s_a, delimiter=',')
                    ff.write('\n')

                #### Check the epsilon condition for the starting state only
                acList = bestTwoActions(mdp, start_state, Qlower, Qupper, Qstar)
                if (QupperMBAE[start_state][acList[1]] - QlowerMBAE[start_state][acList[0]] < epsilon * (1 - mdp.discountFactor) / 2 and iteration > 50):
                    print(QupperMBAE[start_state][acList[1]] - QlowerMBAE[start_state][acList[0]], "<", epsilon * (1 - mdp.discountFactor) / 2)
                    acList = bestTwoActions(mdp, start_state, QlowerMBAE, QupperMBAE, Qstar)
                    a = open('final' + mdp.filename + '-rr.txt', 'a+')
                    a.write(str(iteration) + '\n')
                    a.close()
                    print("Setting final_policy of ", start_state, " to", acList[0])
                    final_policy[start_state] = acList[0]
                    print("Iterations taken : ", iteration)
                    print("Returning the policy :", final_policy)
                    for i in range(mdp.numStates):
                        if final_policy[i] == -1:
                            final_policy[i] = bestTwoActions(mdp, i, QlowerMBAE, QupperMBAE, Qstar)[0]
                    return final_policy

    for i in range(mdp.numStates):
        if final_policy[i] == -1:
            final_policy[i] = bestTwoActions(mdp, i, QlowerMBAE, QupperMBAE, Qstar)[0]
    return final_policy
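# The selection performed by bestTwoActions(...) in RoundRobin above follows the LUCB pattern:
# the empirically best action at a state plus its strongest competitor by upper confidence bound.
# The sketch below illustrates only that idea; the actual bestTwoActions defined elsewhere in
# this module may differ in its tie-breaking and in which Q matrices it consults. Assumes numpy
# is imported as np, as in the rest of this file.
def _sketch_best_two_actions(q_upper_row, q_star_row):
    best = int(np.argmax(q_star_row))          # empirically best action at this state
    order = np.argsort(q_upper_row)[::-1]      # competitors ranked by upper confidence bound
    challenger = int(order[1]) if int(order[0]) == best else int(order[0])
    return best, challenger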
def LUCBBound(mdp, start_state=0, epsilon=4, delta=0.1, fileprint=1):
    global MAX_ITERATION_LIMIT, c
    iteration = 0
    it = 0
    H = int((math.log(mdp.Vmax) + math.log(6.0 / epsilon)) / (1 - mdp.discountFactor))
    initial_iterations = 1 * mdp.numStates * mdp.numActions
    rewards_s_a_sprime = np.zeros((mdp.numStates, mdp.numActions, mdp.numStates))
    R_s_a = np.zeros((mdp.numStates, mdp.numActions))
    sampled_frequency_s_a = np.zeros((mdp.numStates, mdp.numActions))
    N_s_a_sprime = np.zeros((mdp.numStates, mdp.numActions, mdp.numStates))
    P = np.zeros((mdp.numStates, mdp.numActions, mdp.numStates))
    P_tilda = np.zeros((mdp.numStates, mdp.numActions, mdp.numStates))
    P_lower_tilda = np.zeros((mdp.numStates, mdp.numActions, mdp.numStates))
    VlowerMBAE = np.zeros((mdp.numStates))
    Vlower = np.zeros((mdp.numStates))
    Vstar = (mdp.Vmax / 2) * np.ones((mdp.numStates))
    Vupper = mdp.Vmax * np.ones((mdp.numStates))
    Qlower = np.zeros((mdp.numStates, mdp.numActions))
    VupperMBAE = mdp.Vmax * np.ones((mdp.numStates))
    QlowerMBAE = np.zeros((mdp.numStates, mdp.numActions))
    Qstar = (mdp.Vmax / 2) * np.ones((mdp.numStates, mdp.numActions))
    QupperMBAE = mdp.Vmax * np.ones((mdp.numStates, mdp.numActions))
    Qupper = mdp.Vmax * np.ones((mdp.numStates, mdp.numActions))
    final_policy = (-1) * np.ones((mdp.numStates), dtype=np.int)
    states_to_sample = range(mdp.numStates)
    colliding_values = np.zeros((mdp.numStates))
    converge_iterations = 10000
    epsilon_convergence = 1e-4
    is_converged = 0
    print("Vmax", mdp.Vmax)

    ### Initial sampling for all state-action pairs
    while it < initial_iterations:
        for state in range(mdp.numStates):
            for act in range(mdp.numActions):
                it += 1
                s_prime, r = mdp.simulate(state, act)
                rewards_s_a_sprime[state][act][s_prime] += r
                R_s_a[state][act] = (r + R_s_a[state][act] * sampled_frequency_s_a[state][act]) / (sampled_frequency_s_a[state][act] + 1)
                sampled_frequency_s_a[state][act] += 1
                N_s_a_sprime[state][act][s_prime] += 1

    ### Calculating initial MBAE estimates of V and Q
    for internal in range(converge_iterations):
        oldQlowerMBAE = np.copy(QlowerMBAE[start_state])
        for state in range(mdp.numStates):
            for act in range(mdp.numActions):
                firstterm = np.sum(rewards_s_a_sprime[state][act]) / sampled_frequency_s_a[state][act]
                secondterm = mdp.discountFactor * np.sum(VupperMBAE * (N_s_a_sprime[state][act] / sampled_frequency_s_a[state][act]))
                lower_secondterm = mdp.discountFactor * np.sum(VlowerMBAE * (N_s_a_sprime[state][act] / sampled_frequency_s_a[state][act]))
                thirdterm = mdp.Vmax * math.sqrt((math.log(c * mdp.numStates * mdp.numActions) - math.log(delta)) / sampled_frequency_s_a[state][act])
                QupperMBAE[state][act] = firstterm + secondterm + thirdterm
                QlowerMBAE[state][act] = firstterm + lower_secondterm - thirdterm
            VupperMBAE[state] = np.amax(QupperMBAE[state])
            VlowerMBAE[state] = np.amax(QlowerMBAE[state])
        if np.linalg.norm(oldQlowerMBAE - QlowerMBAE[start_state]) <= epsilon_convergence:
            print("Stopping with ", internal, "initial internal iterations")
            break
    if internal == converge_iterations - 1:
        print("Used all iterations")

    print("Initial estimate of QupperMBAE found! Now sampling")
    sys.stdout = open(mdp.filename + '-lucbbound.txt', 'w+')
    ff = open(mdp.filename + '-lucbbound-samples.txt', 'w+')

    h = 0
    state1 = start_state
    while iteration < MAX_ITERATION_LIMIT:
        max_collision_state = [sorted(states_to_sample, key=lambda x: colliding_values[x], reverse=True)[0]]
        # Restart an episode of length H from the start state
        if h % H == 0:
            state1 = start_state
            h = 0
        else:
            state1 = nextstate
        # LUCB-style choice: sample one of the two contending actions at random
        actionsList = bestTwoActions(mdp, state1, Qlower, Qupper, Qstar)
        a = np.random.choice(actionsList)
        iteration += 1
        for t in range(1):
            s_prime, r = mdp.simulate(state1, a)
            nextstate = s_prime
            rewards_s_a_sprime[state1][a][s_prime] += r
            R_s_a[state1][a] = (r + R_s_a[state1][a] * sampled_frequency_s_a[state1][a]) / (sampled_frequency_s_a[state1][a] + 1)
            sampled_frequency_s_a[state1][a] += 1
            N_s_a_sprime[state1][a][s_prime] += 1

        ## Calculating Q and V values (MBIE bounds)
        for i in range(mdp.numStates):
            for j in range(mdp.numActions):
                if sampled_frequency_s_a[i][j] > 0:
                    P_tilda[i][j] = UpperP(i, j, delta, N_s_a_sprime[i][j], mdp.numStates, Vupper, False)
                    P_lower_tilda[i][j] = LowerP(i, j, delta, N_s_a_sprime[i][j], mdp.numStates, Vlower, False)
        Qupper, Vupper = iteratedConvergence(Qupper, R_s_a, P_tilda, mdp.discountFactor, epsilon, converge_iterations, epsilon_convergence)
        Qlower, Vlower = iteratedConvergence(Qlower, R_s_a, P_lower_tilda, mdp.discountFactor, epsilon, converge_iterations, epsilon_convergence)

        # MBAE bounds: iterate the empirical backup with a confidence width until it stabilises
        for internal in range(converge_iterations):
            oldQlowerMBAE = np.copy(QlowerMBAE[start_state])
            for state in range(mdp.numStates):
                for act in range(mdp.numActions):
                    firstterm = np.sum(rewards_s_a_sprime[state][act]) / sampled_frequency_s_a[state][act]
                    secondterm = mdp.discountFactor * np.sum(VupperMBAE * (N_s_a_sprime[state][act] / sampled_frequency_s_a[state][act]))
                    lower_secondterm = mdp.discountFactor * np.sum(VlowerMBAE * (N_s_a_sprime[state][act] / sampled_frequency_s_a[state][act]))
                    star_secondterm = mdp.discountFactor * np.sum(Vstar * (N_s_a_sprime[state][act] / sampled_frequency_s_a[state][act]))
                    thirdterm = mdp.Vmax * math.sqrt((math.log(c * (iteration**2) * mdp.numStates * mdp.numActions) - math.log(delta)) / sampled_frequency_s_a[state][act])
                    QupperMBAE[state][act] = firstterm + secondterm + thirdterm
                    QlowerMBAE[state][act] = firstterm + lower_secondterm - thirdterm
                    Qstar[state][act] = firstterm + star_secondterm
                VupperMBAE[state] = np.amax(QupperMBAE[state])
                VlowerMBAE[state] = np.amax(QlowerMBAE[state])
                Vstar[state] = np.amax(Qstar[state])
            if np.linalg.norm(oldQlowerMBAE - QlowerMBAE[start_state]) <= epsilon_convergence:
                break

        count = 0
        if iteration % 10000 == 0:
            print(iteration, (QupperMBAE[start_state][acList[1]] - QlowerMBAE[start_state][acList[0]]) / epsilon)
            np.savetxt(ff, sampled_frequency_s_a, delimiter=',')
            ff.write('\n')

        ##### Updating the list of colliding states
        states_to_sample = []
        for st in range(mdp.numStates):
            acList = bestTwoActions(mdp, st, Qlower, Qupper, Qstar)
            ##### Stopping condition uses epsilon*(1-gamma)/2
            colliding_values[st] = Qupper[st][acList[1]] - Qlower[st][acList[0]] - epsilon * (1 - mdp.discountFactor) / 2
            if colliding_values[st] > 0:
                ### this state is still colliding, keep sampling it
                states_to_sample.append(st)

        #### Check the epsilon condition for the starting state only
        if not (start_state in states_to_sample):
            acList = bestTwoActions(mdp, start_state, Qlower, Qupper, Qstar)
            print("Setting final_policy of ", start_state, " to", acList[0])
            final_policy[start_state] = acList[0]
            print("Iterations taken : ", iteration)
            print("Returning the policy :", final_policy)
            for i in range(mdp.numStates):
                if final_policy[i] == -1:
                    final_policy[i] = bestTwoActions(mdp, i, Qlower, Qupper, Qstar)[0]
            return final_policy
        h += 1

    for i in range(mdp.numStates):
        if final_policy[i] == -1:
            final_policy[i] = bestTwoActions(mdp, i, Qlower, Qupper, Qstar)[0]
    return final_policy
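# The horizon H used by LUCBBound (and by markovchain below) is consistent with requiring
# gamma**H * Vmax <= epsilon/6: since gamma <= exp(-(1 - gamma)), any
# H >= (ln Vmax + ln(6/epsilon)) / (1 - gamma) suffices, up to the integer truncation applied
# here. Illustrative helper only; the functions in this file inline the same expression.
def _sketch_episodic_horizon(vmax, gamma, epsilon):
    # matches the inline computation of H above
    return int((math.log(vmax) + math.log(6.0 / epsilon)) / (1 - gamma))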
def markovchain(mdp, start_state=0, epsilon=4, randomseed=None, algo="episodic", delta=0.1, bounds="MBAE"):
    if randomseed is not None:
        np.random.seed(randomseed)
    policies = np.array(getPolicies(mdp.numStates, mdp.numActions))
    numPolicies = len(policies)
    print(numPolicies)
    H = int((math.log(mdp.Vmax) + math.log(6.0 / epsilon)) / (1 - mdp.discountFactor))
    print("Chosen value of H is : ", H)

    ## Initializations
    it = 0
    samples = 0
    initial_iterations = 1 * mdp.numStates * mdp.numActions
    R_s_a = np.zeros((mdp.numStates, mdp.numActions))
    N_s_a_sprime = np.zeros((mdp.numStates, mdp.numActions, mdp.numStates), dtype=np.int)
    N_s_a = np.zeros((mdp.numStates, mdp.numActions), dtype=np.int)
    P_s_a_sprime = np.zeros((mdp.numStates, mdp.numActions, mdp.numStates))
    Qupper = mdp.Vmax * np.ones((numPolicies, mdp.numStates))
    QupperMBAE = mdp.Vmax * np.ones((numPolicies, mdp.numStates))
    Qlower = np.zeros((numPolicies, mdp.numStates))
    Qstar = (mdp.Vmax / 2) * np.ones((numPolicies, mdp.numStates))
    QstarMBAE = (mdp.Vmax / 2) * np.ones((numPolicies, mdp.numStates))
    QlowerMBAE = np.zeros((numPolicies, mdp.numStates))
    P_tilda = np.zeros((numPolicies, mdp.numStates, mdp.numStates))
    P_lower_tilda = np.zeros((numPolicies, mdp.numStates, mdp.numStates))
    VlowerMBAE = np.zeros((numPolicies, mdp.numStates))
    VupperMBAE = mdp.Vmax * np.ones((numPolicies, mdp.numStates))
    Vstar = (mdp.Vmax / 2) * np.ones((numPolicies, mdp.numStates))
    discovered_states = set([start_state])
    deltadeltaV = np.zeros((mdp.numStates))
    state_dist = np.zeros((mdp.numStates))
    state_dist[start_state] = 1

    ### Initial sampling for all state-action pairs
    while it < initial_iterations:
        for state in range(mdp.numStates):
            for act in range(mdp.numActions):
                it = it + 1
                ss, rr = mdp.simulate(state, act)
                R_s_a[state][act] = (rr + R_s_a[state][act] * N_s_a[state][act]) / (N_s_a[state][act] + 1)
                N_s_a[state][act] = N_s_a[state][act] + 1
                N_s_a_sprime[state][act][ss] = N_s_a_sprime[state][act][ss] + 1
                for s2 in range(mdp.numStates):
                    P_s_a_sprime[state][act][s2] = float(N_s_a_sprime[state][act][s2]) / N_s_a[state][act]
    samples += initial_iterations

    if algo == "use_ddv":
        ff = open(mdp.filename + '-markovddv' + str(randomseed) + '.txt', 'w')
    elif algo == "episodic":
        ff = open(mdp.filename + '-markoveps' + str(randomseed) + '.txt', 'w')
    elif algo == "uniform":
        ff = open(mdp.filename + '-markovuni' + str(randomseed) + '.txt', 'w')
    elif algo == "greedyMBAE":
        ff = open(mdp.filename + '-markovMBAE' + str(randomseed) + '.txt', 'w')
    elif algo == "greedyMBIE":
        ff = open(mdp.filename + '-markovMBIE' + str(randomseed) + '.txt', 'w')
    elif algo == "mybest":
        ff = open(mdp.filename + '-markovbest' + str(randomseed) + '.txt', 'w')

    while samples < MAX_ITERATION_LIMIT:
        p = 0
        current_policy = fixedPolicy
        # MBIE bounds for the fixed policy: optimistic/pessimistic transition models
        for i in range(mdp.numStates):
            if N_s_a[i][current_policy[i]] > 0:
                P_tilda[p][i] = UpperP(i, current_policy[i], delta, N_s_a_sprime[i][current_policy[i]], mdp.numStates, Qupper[p], False)
                P_lower_tilda[p][i] = LowerP(i, current_policy[i], delta, N_s_a_sprime[i][current_policy[i]], mdp.numStates, Qlower[p], False)
        Qupper[p] = itConvergencePolicy(Qupper[p], getRewards(R_s_a, current_policy), P_tilda[p], mdp.discountFactor, epsilon, converge_iterations, epsilon_convergence)
        Qlower[p] = itConvergencePolicy(Qlower[p], getRewards(R_s_a, current_policy), P_lower_tilda[p], mdp.discountFactor, epsilon, converge_iterations, epsilon_convergence)
        Qstar[p] = itConvergencePolicy(Qstar[p], getRewards(R_s_a, current_policy), getProb(P_s_a_sprime, current_policy), mdp.discountFactor, epsilon, converge_iterations, epsilon_convergence)

        # MBAE bounds for the fixed policy
        for internal in range(converge_iterations):
            oldQlowerMBAE = np.copy(QlowerMBAE[p][start_state])
            for state in range(mdp.numStates):
                act = current_policy[state]
                firstterm = R_s_a[state][act]
                secondterm = mdp.discountFactor * np.sum(VupperMBAE[p] * P_s_a_sprime[state][act])
                lower_secondterm = mdp.discountFactor * np.sum(VlowerMBAE[p] * P_s_a_sprime[state][act])
                star_secondterm = mdp.discountFactor * np.sum(Vstar[p] * P_s_a_sprime[state][act])
                thirdterm = mdp.Vmax * math.sqrt((math.log(c * (samples**2) * mdp.numStates * 1) - math.log(delta)) / N_s_a[state][act])
                QupperMBAE[p][state] = firstterm + secondterm + thirdterm
                QlowerMBAE[p][state] = firstterm + lower_secondterm - thirdterm
                QstarMBAE[p][state] = firstterm + star_secondterm
                VupperMBAE[p][state] = QupperMBAE[p][state]
                VlowerMBAE[p][state] = QlowerMBAE[p][state]
                Vstar[p][state] = QstarMBAE[p][state]
            if np.linalg.norm(oldQlowerMBAE - QlowerMBAE[p][start_state]) <= epsilon_convergence:
                break

        policy1Index = 0
        h = 0
        policy1 = fixedPolicy
        state = start_state
        if (samples % 1000) < 100:
            if verbose == 0:
                ff.write(str(samples))
                ff.write('\t')
                if plot_vstar:
                    ff.write(str(Vstar[policy1Index][start_state]))
                else:
                    ff.write(str(QupperMBAE[policy1Index][start_state] - QlowerMBAE[policy1Index][start_state]))  # -epsilon*(1-mdp.discountFactor)/2
                    print(samples, QupperMBAE[policy1Index][start_state] - QlowerMBAE[policy1Index][start_state])
                ff.write('\n')
            else:
                print(samples)
                print(QupperMBAE[:, start_state], QlowerMBAE[:, start_state])
        polList = [policy1Index]

        if algo == "use_ddv":
            ## Calculate delta-delta-V for all discovered states and sample the maximiser
            for pnum in polList:
                policiesfddv = fixedPolicy
                for st in list(discovered_states):
                    ac = policiesfddv[st]
                    #### Compute del-del-V
                    deltadeltaV[st] = CalculateDelDelV(st, ac, mdp, N_s_a_sprime, QupperMBAE[pnum], QlowerMBAE[pnum], None, None, start_state, P_s_a_sprime, P_tilda[pnum], P_lower_tilda[pnum], R_s_a, epsilon, delta, converge_iterations, epsilon_convergence, policiesfddv)
                cs = np.argmax(deltadeltaV)
                ca = policiesfddv[cs]
                ss, rr = mdp.simulate(cs, ca)
                time.sleep(0.1)
                samples = samples + 1
                discovered_states.add(ss)
                R_s_a[cs][ca] = (rr + R_s_a[cs][ca] * N_s_a[cs][ca]) / (N_s_a[cs][ca] + 1)
                N_s_a[cs][ca] += 1
                N_s_a_sprime[cs][ca][ss] += 1
                for s2 in range(mdp.numStates):
                    P_s_a_sprime[cs][ca][s2] = float(N_s_a_sprime[cs][ca][s2]) / N_s_a[cs][ca]
        elif algo == "episodic":
            # Roll out the fixed policy for H steps from the start state
            while h < H:
                act = policy1[state]
                ss, rr = mdp.simulate(state, act)
                samples += 1
                R_s_a[state][act] = (rr + R_s_a[state][act] * N_s_a[state][act]) / (N_s_a[state][act] + 1)
                N_s_a[state][act] += 1
                N_s_a_sprime[state][act][ss] += 1
                for s2 in range(mdp.numStates):
                    P_s_a_sprime[state][act][s2] = float(N_s_a_sprime[state][act][s2]) / N_s_a[state][act]
                state = ss
                h += 1
        elif algo == "uniform":
            # One sample per state under the fixed policy
            for st in range(mdp.numStates):
                ac = fixedPolicy[st]
                ss, rr = mdp.simulate(st, ac)
                samples += 1
                R_s_a[st][ac] = (rr + R_s_a[st][ac] * N_s_a[st][ac]) / (N_s_a[st][ac] + 1)
                N_s_a[st][ac] += 1
                N_s_a_sprime[st][ac][ss] += 1
                for s2 in range(mdp.numStates):
                    P_s_a_sprime[st][ac][s2] = float(N_s_a_sprime[st][ac][s2]) / N_s_a[st][ac]
        elif algo == "greedyMBAE":
            # Sample the state with the widest MBAE interval
            st = max(range(mdp.numStates), key=lambda x: VupperMBAE[0][x] - VlowerMBAE[0][x])
            ac = fixedPolicy[st]
            ss, rr = mdp.simulate(st, ac)
            samples += 1
            R_s_a[st][ac] = (rr + R_s_a[st][ac] * N_s_a[st][ac]) / (N_s_a[st][ac] + 1)
            N_s_a[st][ac] += 1
            N_s_a_sprime[st][ac][ss] += 1
            for s2 in range(mdp.numStates):
                P_s_a_sprime[st][ac][s2] = float(N_s_a_sprime[st][ac][s2]) / N_s_a[st][ac]
        elif algo == "greedyMBIE":
            # Sample the state with the widest MBIE interval
            st = max(range(mdp.numStates), key=lambda x: Qupper[0][x] - Qlower[0][x])
            ac = fixedPolicy[st]
            ss, rr = mdp.simulate(st, ac)
            samples += 1
            R_s_a[st][ac] = (rr + R_s_a[st][ac] * N_s_a[st][ac]) / (N_s_a[st][ac] + 1)
            N_s_a[st][ac] += 1
            N_s_a_sprime[st][ac][ss] += 1
            for s2 in range(mdp.numStates):
                P_s_a_sprime[st][ac][s2] = float(N_s_a_sprime[st][ac][s2]) / N_s_a[st][ac]
        elif algo == "mybest":
            # Periodically reset the state distribution to the start state, then draw a batch
            if samples % 10000 < 50:
                state_dist = np.zeros((mdp.numStates))
                state_dist[start_state] = 1
            N = getSampleCount(state_dist, N_s_a_sprime, QupperMBAE[policy1Index], QlowerMBAE[policy1Index], QstarMBAE[policy1Index])
            for i in range(N):
                st = np.random.choice(np.arange(mdp.numStates), p=state_dist)
                ac = fixedPolicy[st]
                ss, rr = mdp.simulate(st, ac)
                samples += 1
                R_s_a[st][ac] = (rr + R_s_a[st][ac] * N_s_a[st][ac]) / (N_s_a[st][ac] + 1)
                N_s_a[st][ac] += 1
                N_s_a_sprime[st][ac][ss] += 1
                for s2 in range(mdp.numStates):
                    P_s_a_sprime[st][ac][s2] = float(N_s_a_sprime[st][ac][s2]) / N_s_a[st][ac]
            state_dist = prob_step(state_dist, P_s_a_sprime, fixedPolicy)

        if (samples % 1000) < 100:
            if (QupperMBAE[policy1Index][start_state] - QlowerMBAE[policy1Index][start_state] - epsilon * (1 - mdp.discountFactor) / 2 < 0):
                print(Qupper[policy1Index][start_state], Qstar[policy1Index][start_state], epsilon * (1 - mdp.discountFactor) / 2)
                print("Epsilon condition reached at ", samples, " samples")
                return fixedPolicy
            else:
                pass

    ff.close()
    return policy1
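# The empirical transition model maintained above is simply P(s' | s, a) = N(s, a, s') / N(s, a),
# refreshed with an explicit loop over successor states after every sample. A vectorized
# equivalent is sketched below for reference only; the algorithms above keep the loop form.
def _sketch_update_empirical_transitions(P_s_a_sprime, N_s_a_sprime, N_s_a, s, a):
    # overwrite the whole row P(. | s, a); assumes N_s_a[s][a] > 0
    P_s_a_sprime[s][a] = N_s_a_sprime[s][a] / float(N_s_a[s][a])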