# --- Test scenario: 7-state, 2-action chain -------------------------------
# Remaining entries of the transition array `Pl` (created earlier in the
# script). Presumably indexed as Pl[state, action, next_state] -- confirm
# against RL.finiteMDP.
Pl[5, 0, 5] = 0.1
Pl[6, 0, 6] = 1
Pl[0, 1, 0] = 1
Pl[1, 1, 1] = 0
Pl[1, 1, 0] = 1
Pl[2, 1, 1] = 1
Pl[3, 1, 2] = 1
Pl[4, 1, 3] = 1
Pl[5, 1, 4] = 1
Pl[6, 1, 5] = 1

# Rows 0 and 6 get value 1 for both columns; every other entry stays 0.
Rl = np.zeros((7, 2))
Rl[[0, 6], :] = 1

# Rows 0 and 6 are flagged with 1 (presumably the absorbing states).
absorv = np.zeros((7, 1))
absorv[[0, 6]] = 1

fmdp = RL.finiteMDP(7, 2, 0.9, Pl, Rl, absorv)

J, traj = fmdp.runPolicy(10000, 3, poltype="exploration")  # choose this value

# Read the reference arrays eagerly and close the archive right away:
# np.load on an .npz returns a lazy NpzFile that keeps the file handle open
# until it is closed. `data` stays a plain mapping so later `data[...]`
# lookups keep working.
with np.load("Q1.npz") as npz_file:
    data = {key: npz_file[key] for key in npz_file.files}

Qr = fmdp.traces2Q(traj)

# Euclidean (Frobenius) distance between the reference and estimated Q.
if np.linalg.norm(data['Q1'] - Qr) < 1:
    print("Aproximação de Q dentro do previsto. OK\n")
else:
    print("Aproximação de Q fora do previsto. FAILED\n")

# Re-run using the estimated Q as the policy parameter and compare the
# resulting trajectory with the stored reference trajectory.
J, traj = fmdp.runPolicy(3, 3, poltype="exploitation", polpar=Qr)
if np.linalg.norm(data['traj2'] - traj) < 1:
    print("Trajectória óptima. OK\n")
else:
    print("Trajectória não óptima. FAILED\n")
# --- Test scenario: 4x4 model passed to RL.finiteMDP(4, 4, 0.9, ...) -------
# Fills slices 1..3 of the middle axis of `Pl` with 4x4 one-hot row matrices
# (`Pl` itself and slice 0 are set up before this point), builds the 4x4
# array `Rl` (entries -1 or 0) and flags the last row of `absorv` with 1.
# It then:
#   1. runs an "exploration" policy (arguments 3000, 0) and converts the
#      returned traces with fmdp.traces2Q,
#   2. compares the result with data['Q1'] from "Q2.npz" using the square
#      root of the summed squared differences (must be < 1 to print OK),
#   3. runs an "exploitation" policy with polpar=Qr and compares the
#      trajectory with data['traj2'] in the same way.
# NOTE(review): presumably Pl is indexed [state, action, next_state] and the
# finiteMDP arguments are (n_states, n_actions, discount, ...) -- confirm
# against the RL module.
# NOTE(review): the object returned by np.load("Q2.npz") is never closed;
# consider reading it under a `with` statement.
# NOTE(review): this section is truncated in the reviewed chunk -- the body
# of the final `else:` lies beyond the visible text.
Pl[:, 1, :] = np.array([[0, 0, 1, 0], [0, 0, 0, 1], [0, 0, 1, 0], [0, 0, 0, 1]]) Pl[:, 2, :] = np.array([[1, 0, 0, 0], [1, 0, 0, 0], [0, 0, 1, 0], [0, 0, 1, 0]]) Pl[:, 3, :] = np.array([[0, 1, 0, 0], [0, 1, 0, 0], [0, 0, 0, 1], [0, 0, 0, 1]]) Rl = np.array([[-1, -1, -1, 0], [-1, 0, -1, -1], [-1, -1, -1, 0], [-1, 0, -1, 0]]) absorv = np.zeros((4, 1)) absorv[-1] = 1 fmdp = RL.finiteMDP(4, 4, 0.9, Pl, Rl, absorv) J, traj = fmdp.runPolicy(3000, 0, poltype="exploration") data = np.load("Q2.npz") Qr = fmdp.traces2Q(traj) result = np.sqrt(sum(sum((data['Q1'] - Qr)**2))) if result < 1: print("Aproximação de Q dentro do previsto. OK\n") else: print("Aproximação de Q fora do previsto. FAILED\n") J, traj = fmdp.runPolicy(3, 1, poltype="exploitation", polpar=Qr) result = np.sqrt(sum(sum((data['traj2'] - traj)**2))) if result < 1: print("Trajectória óptima. OK\n") else: