Code example #1
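# First-visit Monte Carlo control with an epsilon-greedy behaviour policy.
# env and world_size come from the surrounding gridworld setup, which is not shown here;
# DISCOUNT and epsilon are the same constants defined in Code example #4.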
import copy
from collections import defaultdict
import numpy as np

Q = np.zeros((world_size[0] * world_size[1], 4))
Q_old = copy.deepcopy(Q)
PI = np.zeros((world_size[0] * world_size[1]), dtype=int)  # 0, 1, 2, 3 stand for the four actions
Return = defaultdict(list)

ite = 0
while True:  # episode loop
    ite += 1
    Q_old = copy.deepcopy(Q)
    s = env.reset()
    s_a_his = []
    r_his = []
    while True:  # step loop
        # a = PI[s]  # purely greedy alternative (no random exploration)
        a = PI[s] if (np.random.rand() > epsilon) else env.random_action()
        s_, r, d = env.step(a)
        s_a_his.append(env.encode_s_a(s, a))
        r_his.append(r)
        s = s_
        if d:
            break
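    # Walk the episode backwards, accumulating the discounted return G;
    # Q is updated only at the first visit of each (state, action) pair.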
    G = 0
    for i in range(len(r_his) - 1, -1, -1):
        G = r_his[i] + DISCOUNT * G
        s_a = s_a_his[i]
        if s_a not in s_a_his[:i]:
            Return[s_a].append(G)
            s, a = env.decode_s_a(s_a)
            Q[s, a] = np.mean(np.array(Return[s_a]))
            PI[s] = np.argmax(Q[s, :])
    # Stop once the Q estimates have stabilised, in the style of the convergence
    # checks used in the other examples.
    if np.max(np.abs(Q - Q_old)) < 0.0001 and ite >= 100:
        break
Code example #2
# Double Q-learning: two action-value tables Q1 and Q2 are maintained; each step
# updates one of them using the other's estimate, which reduces maximisation bias.
Q = np.zeros((world_size[0] * world_size[1], 4))  # combined table, used only for the convergence check
Q_old = copy.deepcopy(Q)
Q1 = np.zeros((world_size[0] * world_size[1], 4))
Q2 = np.zeros((world_size[0] * world_size[1], 4))

ite = 0
while True:  # episode loop
    ite += 1
    Q_old = copy.deepcopy(Q)
    s = env.reset()
    while True:  # step loop
        a = np.argmax(Q1[s, :] + Q2[s, :]) if (np.random.rand() > epsilon) else env.random_action()
        s_, r, d = env.step(a)
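        # With probability 0.5, update Q1 towards Q2's value of Q1's greedy next action;
        # otherwise update Q2 symmetrically. Decoupling action selection from evaluation
        # is what tames the maximisation bias of plain Q-learning.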
        if np.random.rand() >= 0.5:
            Q1[s, a] += alpha * (r + DISCOUNT * Q2[s_, np.argmax(Q1[s_, :])] - Q1[s, a])
        else:
            Q2[s, a] += alpha * (r + DISCOUNT * Q1[s_, np.argmax(Q2[s_, :])] - Q2[s, a])
        s = s_
        if d:
            break
    Q = (Q1 + Q2)/2.0
    print(np.max(np.abs(Q - Q_old)))
    if np.max(np.abs(Q - Q_old)) < 0.0001 and ite >= 10000:
        break

print(ite)
print(Q)
Code example #3
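# First-visit Monte Carlo prediction: estimate the state-value function V of a
# uniformly random policy by averaging the returns observed after each first visit.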
V = np.zeros((world_size[0] * world_size[1]))
V_old = copy.deepcopy(V)
PI = np.zeros((world_size[0] * world_size[1]), dtype=int)  # 0, 1, 2, 3 stand for the four actions (unused in this example)
Return = defaultdict(list)

ite = 0
while True:  # episode loop
    ite += 1
    V_old = copy.deepcopy(V)
    s = env.reset()
    s_his = []
    r_his = []
    while True:  # step loop
        # uniformly random behaviour policy
        a = env.random_action()
        s_, r, d = env.step(a)
        s_his.append(s)
        r_his.append(r)
        s = s_
        if d:
            break
    G = 0
    for i in range(len(r_his) - 1, -1, -1):
        G = r_his[i] + DISCOUNT * G
        s = s_his[i]
        if s not in s_his[:i]:
            Return[s].append(G)
            V[s] = np.mean(np.array(Return[s]))
    if np.max(np.abs(V - V_old)) < 0.0001 and ite >= 100:
        break
Code example #4
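# Tabular Q-learning (off-policy TD control) with an epsilon-greedy behaviour policy.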
DISCOUNT = 0.9
epsilon = 0.1
alpha = 0.01
Q = np.zeros((world_size[0] * world_size[1], 4))
Q_old = copy.deepcopy(Q)
V = np.zeros((world_size[0] * world_size[1]))
V_old = copy.deepcopy(V)

ite = 0
while True:  # episode loop
    ite += 1
    Q_old = copy.deepcopy(Q)
    s = env.reset()
    while True:  # step loop
        a = np.argmax(Q[s, :]) if (np.random.rand() > epsilon) else env.random_action()
        s_, r, d = env.step(a)
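        # Q-learning TD update: bootstrap from the greedy action value of the next state.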
        Q[s, a] += alpha * (r + DISCOUNT * np.max(Q[s_, :]) - Q[s, a])
        s = s_
        if d:
            break
    print(np.max(np.abs(Q - Q_old)))
    if np.max(np.abs(Q - Q_old)) < 0.0001 and ite >= 10000:
        break

print(ite)
print(Q)

# Derive the greedy state values from the learned action values
# (equivalent to V = np.max(Q, axis=1)).
for i in range(world_size[0] * world_size[1]):
    V[i] = np.max(Q[i, :])
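All four snippets above rely on a gridworld environment object env whose definition is not shown in this section. Purely for reference, the sketch below is one possible, illustrative environment matching the assumed interface (reset, step, random_action, encode_s_a, decode_s_a); the start state, action coding, rewards and terminal cell are assumptions, not the environment actually used above.

import numpy as np


class GridWorldSketch:
    """Illustrative gridworld matching the interface assumed by the examples above.

    States are numbered row-major from 0 to world_size[0] * world_size[1] - 1.
    Actions: 0=up, 1=down, 2=left, 3=right (an assumption; the real mapping may differ).
    Reward is -1 per step and 0 on reaching the terminal cell (also an assumption).
    """

    def __init__(self, world_size=(4, 4), terminal_state=15):
        self.rows, self.cols = world_size
        self.n_states = self.rows * self.cols
        self.terminal_state = terminal_state
        self.s = 0

    def reset(self):
        # Every episode starts in state 0 (assumption).
        self.s = 0
        return self.s

    def random_action(self):
        return np.random.randint(4)

    def step(self, a):
        # Move within the grid, staying in place when a move would leave it.
        r, c = divmod(self.s, self.cols)
        if a == 0:
            r = max(r - 1, 0)
        elif a == 1:
            r = min(r + 1, self.rows - 1)
        elif a == 2:
            c = max(c - 1, 0)
        else:
            c = min(c + 1, self.cols - 1)
        self.s = r * self.cols + c
        done = self.s == self.terminal_state
        reward = 0.0 if done else -1.0
        return self.s, reward, done

    def encode_s_a(self, s, a):
        # Pack a (state, action) pair into a single integer key.
        return s * 4 + a

    def decode_s_a(self, s_a):
        # Inverse of encode_s_a.
        return divmod(s_a, 4)

With this sketch, setting world_size = (4, 4), env = GridWorldSketch(world_size) and the constants from Code example #4 (DISCOUNT, epsilon, alpha) gives the snippets above concrete objects to run against.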