# On-policy first-visit Monte Carlo control with an epsilon-greedy policy.
#
# Reads `world_size`, `env`, `epsilon` and `DISCOUNT` from the enclosing script.
# Produces:
#   Q   -- action-value table, one row per state and one column per action
#   PI  -- greedy action index per state with respect to Q
#   ite -- number of episodes that were run
from collections import defaultdict

n_states = world_size[0] * world_size[1]
Q = np.zeros((n_states, 4))
Q_old = Q.copy()
# 0, 1, 2, 3 stand for the 4 actions. The builtin `int` replaces the `np.int`
# alias, which no longer exists in NumPy >= 1.24.
PI = np.zeros(n_states, dtype=int)
# Encoded (state, action) pair -> list of first-visit returns observed so far.
Return = defaultdict(list)
ite = 0
while True:  # episode loop
    ite += 1
    Q_old = Q.copy()
    s = env.reset()
    s_a_his = []
    r_his = []
    while True:  # step loop
        # Epsilon-greedy: follow PI unless the uniform draw falls at or below
        # epsilon, in which case the environment supplies a random action.
        a = PI[s] if (np.random.rand() > epsilon) else env.random_action()
        s_, r, d = env.step(a)
        s_a_his.append(env.encode_s_a(s, a))
        r_his.append(r)
        s = s_
        if d:  # `d` is the flag that ends the episode
            break

    # Index of the first occurrence of every (state, action) pair in this
    # episode. Looking it up is O(1), whereas re-scanning `s_a_his[:i]` at
    # every step made the update quadratic in the episode length.
    first_visit = {}
    for i, s_a in enumerate(s_a_his):
        first_visit.setdefault(s_a, i)

    # Walk the episode backwards, accumulating the discounted return G.
    G = 0
    for i in range(len(r_his) - 1, -1, -1):
        G = r_his[i] + DISCOUNT * G
        s_a = s_a_his[i]
        if first_visit[s_a] == i:
            Return[s_a].append(G)
            s, a = env.decode_s_a(s_a)
            Q[s, a] = np.mean(Return[s_a])
            PI[s] = np.argmax(Q[s, :])

    # Stop once an episode barely moves Q. `Q_old` was snapshotted every
    # episode but never read, so the loop could not terminate; the test below
    # mirrors the one used by the other Q-table loops in this file.
    if np.max(np.abs(Q - Q_old)) < 0.0001 and ite >= 10000:
        break
# Double Q-learning with an epsilon-greedy behaviour policy.
#
# Reads `world_size`, `env`, `epsilon`, `alpha` and `DISCOUNT` from the
# enclosing script. Two tables, Q1 and Q2, are learned side by side; Q is
# their average and is what the convergence test looks at.
table_shape = (world_size[0] * world_size[1], 4)
Q = np.zeros(table_shape)
Q_old = Q.copy()
Q1 = np.zeros(table_shape)
Q1_old = Q1.copy()
Q2 = np.zeros(table_shape)
Q2_old = Q2.copy()
V = np.zeros(table_shape[0])
V_old = V.copy()
ite = 0
while True:  # episode loop
    ite += 1
    Q_old = Q.copy()
    s = env.reset()
    d = False
    while not d:  # step loop; `d` is the flag that ends the episode
        # Behave greedily with respect to Q1 + Q2, exploring when the uniform
        # draw is not above epsilon.
        if np.random.rand() > epsilon:
            a = np.argmax(Q1[s, :] + Q2[s, :])
        else:
            a = env.random_action()
        s_, r, d = env.step(a)

        # A fair coin picks which table learns this step; the other table
        # scores the learner's greedy next action.
        if np.random.rand() >= 0.5:
            learner, evaluator = Q1, Q2
        else:
            learner, evaluator = Q2, Q1
        greedy_next = np.argmax(learner[s_, :])
        learner[s, a] += alpha * (
            r + DISCOUNT * evaluator[s_, greedy_next] - learner[s, a]
        )
        s = s_

    Q = (Q1 + Q2) / 2.0
    # Largest absolute change of the averaged table over this episode.
    delta = np.max(np.abs(Q - Q_old))
    print(delta)
    if delta < 0.0001 and ite >= 10000:
        break
print(ite)
print(Q)
# First-visit Monte Carlo prediction of the state values under a random policy.
#
# Reads `world_size`, `env` and `DISCOUNT` from the enclosing script.
# Produces:
#   V   -- estimated value per state (mean of its first-visit returns)
#   ite -- number of episodes that were run
from collections import defaultdict

n_states = world_size[0] * world_size[1]
V = np.zeros(n_states)
V_old = V.copy()
# 0, 1, 2, 3 stand for the 4 actions. The builtin `int` replaces the `np.int`
# alias, which no longer exists in NumPy >= 1.24.
PI = np.zeros(n_states, dtype=int)
# State -> list of first-visit returns observed so far.
Return = defaultdict(list)
ite = 0
while True:  # episode loop
    ite += 1
    V_old = V.copy()
    s = env.reset()
    s_his = []
    r_his = []
    while True:  # step loop
        a = env.random_action()  # random policy
        s_, r, d = env.step(a)
        s_his.append(s)
        r_his.append(r)
        s = s_
        if d:  # `d` is the flag that ends the episode
            break

    # Index of the first occurrence of every state in this episode. Looking it
    # up is O(1), whereas re-scanning `s_his[:i]` at every step made the update
    # quadratic in the episode length.
    first_visit = {}
    for i, visited in enumerate(s_his):
        first_visit.setdefault(visited, i)

    # Walk the episode backwards, accumulating the discounted return G.
    G = 0
    for i in range(len(r_his) - 1, -1, -1):
        G = r_his[i] + DISCOUNT * G
        s = s_his[i]
        if first_visit[s] == i:
            Return[s].append(G)
            V[s] = np.mean(Return[s])

    # Stop once an episode barely moves V, but never before 100 episodes.
    if np.max(np.abs(V - V_old)) < 0.0001 and ite >= 100:
        break
# Tabular Q-learning with an epsilon-greedy behaviour policy.
#
# Reads `world_size` and `env` from the enclosing script. Produces the
# action-value table Q and, from it, the greedy state values V.
DISCOUNT = 0.9  # discount factor applied to the bootstrapped next-state value
epsilon = 0.1   # exploration threshold for the epsilon-greedy choice
alpha = 0.01    # step size of the Q update

state_count = world_size[0] * world_size[1]
Q = np.zeros((state_count, 4))
Q_old = Q.copy()
V = np.zeros(state_count)
V_old = V.copy()
ite = 0
while True:  # episode loop
    ite += 1
    Q_old = Q.copy()
    s = env.reset()
    d = False
    while not d:  # step loop; `d` is the flag that ends the episode
        if np.random.rand() > epsilon:
            a = np.argmax(Q[s, :])
        else:
            a = env.random_action()
        s_, r, d = env.step(a)
        # Move Q[s, a] towards the reward plus the discounted best
        # next-state action value.
        td_target = r + DISCOUNT * np.max(Q[s_, :])
        Q[s, a] += alpha * (td_target - Q[s, a])
        s = s_

    # Largest absolute change of Q over this episode.
    delta = np.max(np.abs(Q - Q_old))
    print(delta)
    if delta < 0.0001 and ite >= 10000:
        break
print(ite)
print(Q)
# Greedy state value: the best action value in each row of Q.
V[:] = Q.max(axis=1)