import numpy as np
import matplotlib.pyplot as plt


def main(MazeEnv):
    env = MazeEnv()
    env.render()
    plt.pause(2)
    n_epochs = 10000
    robot_loc = []
    steps = 0
    rewards = 0.0
    for i in range(n_epochs):
        steps += 1
        # next_action = np.random.randint(4, size=1)  # random policy (disabled)
        next_action, robot_loc = env.expert(robot_loc)  # hand-coded expert policy
        state_img, reward, done, _ = env.step(next_action)
        rewards += reward
        env.render()
        print('Step = %d, rewards = %.1f, reward = %.1f, done = %d'
              % (steps, rewards, reward, done), end='\r')
        if done:
            print('\n')
            steps = 0
            rewards = 0.0
            plt.pause(2)
            env.reset()
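
# Hypothetical entry point (not shown in the original listing): main() takes
# the environment class itself as its argument, so pass MazeEnv uninstantiated.
if __name__ == '__main__':
    from MazeEnv import MazeEnv
    main(MazeEnv)
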
import copy
from collections import defaultdict

import numpy as np

from MazeEnv import *

world_size = np.array([5, 5])
env = MazeEnv(world_size=world_size,
              gold_pos=np.array([[1, 2]]),
              bad_pos=np.array([[3, 4]]),
              max_ite=50)
env.reset()

DISCOUNT = 0.9
epsilon = 0.1
Q = np.zeros((world_size[0] * world_size[1], 4))
Q_old = copy.deepcopy(Q)
# 0, 1, 2, 3 denote the four actions
PI = np.zeros((world_size[0] * world_size[1]), dtype=int)
# Return = defaultdict(list)

ite = 0
while True:  # episode loop
    ite += 1
    Q_old = copy.deepcopy(Q)
    s = env.reset()
    s_a_his = []
    r_his = []
    while True:  # step loop
        # a = PI[s]  # purely greedy policy
        # epsilon-greedy: follow PI with prob. 1 - epsilon, explore otherwise
        a = PI[s] if (np.random.rand() > epsilon) else env.random_action()
        s_, r, d = env.step(a)
        s_a_his.append(env.encode_s_a(s, a))
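        # ------------------------------------------------------------------
        # The original listing is truncated at this point. What follows is a
        # minimal sketch of the remaining every-visit Monte Carlo update,
        # under assumptions: env.decode_s_a() is a hypothetical inverse of
        # env.encode_s_a(), and the 0.1 step size and 1e-4 stopping threshold
        # are guesses, not taken from the source.
        # ------------------------------------------------------------------
        r_his.append(r)
        s = s_
        if d:
            break
    # Walk the episode backwards, accumulating the discounted return G,
    # and nudge Q(s, a) toward G for every visited state-action pair.
    G = 0.0
    for t in reversed(range(len(s_a_his))):
        G = DISCOUNT * G + r_his[t]
        s_t, a_t = env.decode_s_a(s_a_his[t])
        Q[s_t, a_t] += 0.1 * (G - Q[s_t, a_t])
    PI = np.argmax(Q, axis=1)  # greedy policy improvement
    if np.max(np.abs(Q - Q_old)) < 1e-4:  # stop once Q stabilizes
        print('Converged after %d episodes' % ite)
        break
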
import numpy as np

from MazeEnv import *

world_size = np.array([5, 5])
env = MazeEnv(world_size=world_size,
              gold_pos=np.array([[1, 2]]),
              bad_pos=np.array([[3, 4]]),
              max_ite=100)
env.reset()

# Equiprobable random policy: each of the four actions with probability 1/4
ACTION_PROB = 0.25
DISCOUNT = 0.9

# Bellman expectation equations in matrix form: (A1 + A2) V = -b,
# i.e. -V + DISCOUNT * P V = -R, which rearranges to V = R + DISCOUNT * P V.
A1 = -1 * np.eye(world_size[0] * world_size[1])
A2 = np.zeros((world_size[0] * world_size[1], world_size[0] * world_size[1]))
b = np.zeros(world_size[0] * world_size[1])
for s in env.feasible_states:
    for a in env.feasible_actions:
        s_, r, d = env.step_state(s, a)
        A2[s, s_] += ACTION_PROB * DISCOUNT
        b[s] += ACTION_PROB * r
V = np.linalg.solve(A1 + A2, -b).reshape(world_size)
print(V)
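
# A minimal cross-check (not in the original listing): iterative policy
# evaluation over the same env.feasible_states / env.feasible_actions /
# env.step_state API should converge to the V from the direct linear solve.
V_iter = np.zeros(world_size[0] * world_size[1])
while True:
    V_new = np.zeros_like(V_iter)
    for s in env.feasible_states:
        for a in env.feasible_actions:
            s_, r, d = env.step_state(s, a)
            # Bellman backup under the equiprobable random policy
            V_new[s] += ACTION_PROB * (r + DISCOUNT * V_iter[s_])
    if np.max(np.abs(V_new - V_iter)) < 1e-8:
        break
    V_iter = V_new
print(np.allclose(V_iter.reshape(world_size), V, atol=1e-6))  # expect True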