import numpy as np
import json
import random

# Gridworld is assumed to be defined (or imported) elsewhere in this project; the
# functions below only rely on its gamma, action, step, reward, isEnd,
# state/currentState, timeStep, and reset members.


def problemA():
    print("PROBLEM A...")
    episodes = 10000
    arr = np.zeros(episodes)
    G = Gridworld()
    G.gamma = 0.9
    for e in range(episodes):  # number of episodes loop
        G.timeStep = 0
        # print("episode %d" % (e + 1))
        while not G.isEnd:
            # print(G.currentState)
            G.step(G.action)
        arr[e] = G.reward
        G.reset()
    opt_disc_returns = np.amax(arr)
    opt_episode = np.argmax(arr) + 1
    mean = np.mean(arr)
    variance = np.var(arr)
    std_dev = np.std(arr)
    min_return = np.amin(arr)
    print("Highest observed discounted returns is %f achieved in"
          " episode number %d" % (opt_disc_returns, opt_episode))
    print("The mean of discounted returns is %f, variance is %f"
          " and standard deviation is %f" % (mean, variance, std_dev))
    print("Max is %f and min is %f" % (opt_disc_returns, min_return))
    return arr

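# The statistics above treat G.reward as the discounted return of the episode. As a
# point of reference, a minimal sketch (an assumption about the quantity the Gridworld
# class is expected to accumulate, not part of its API) of the discounted return
# computed from a list of per-step rewards:
def discounted_return(rewards, gamma=0.9):
    """G = sum_t gamma**t * R_t for per-step rewards R_0, R_1, ..."""
    return sum((gamma ** t) * r for t, r in enumerate(rewards))
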
def problemC():
    print("PROBLEM C...")
    policy = np.array([
        3, 3, 3, 3, 1,
        0, 3, 3, 3, 1,
        0, 2, 4, 3, 1,
        0, 2, 4, 3, 1,
        0, 2, 3, 3, 4
    ])
    episodes = 10000
    arr = np.zeros(episodes)
    G = Gridworld()
    G.gamma = 0.9
    for e in range(episodes):
        G.timeStep = 0
        # print("episode %d" % (e + 1))
        while not G.isEnd:
            # print(G.currentState)
            G.step(G.stoch_action(policy[G.state]))
        arr[e] = G.reward
        G.reset()
        # arr[e] = disc_returns
    opt_disc_returns = np.amax(arr)
    opt_episode = np.argmax(arr) + 1
    mean = np.mean(arr)
    variance = np.var(arr)
    std_dev = np.std(arr)
    min_return = np.amin(arr)
    print("Highest observed discounted returns is %f achieved in"
          " episode number %d" % (opt_disc_returns, opt_episode))
    print("The mean of discounted returns is %f, variance is %f"
          " and standard deviation is %f" % (mean, variance, std_dev))
    print("Max is %f and min is %f" % (opt_disc_returns, min_return))
    # print(np.argmin(arr) + 1)
    return arr

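# The policy vector in problemC indexes the 25 grid cells row by row, using the
# Gridworld class's action codes. A small helper (added only as a debugging
# convenience, not part of the assignment code) to view a flat 25-entry policy as the
# 5x5 grid it encodes:
def print_policy_grid(policy):
    """Print a length-25 policy vector as a 5x5 grid of action codes."""
    print(np.asarray(policy).reshape(5, 5))
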
def problemE():
    print("PROBLEM E...")
    episodes = 10000
    count = 0
    # Note: this Gridworld appears to number states from 1, so startState=19 and
    # state 22 here correspond to the problem statement's states 18 and 21.
    G = Gridworld(startState=19)
    G.gamma = 0.9
    for e in range(episodes):
        G.timeStep = 0
        # Simulate the 11 steps that take the agent from time step 8 to time step 19.
        while G.timeStep < 11 and not G.isEnd:
            G.step(G.action)
        if G.state == 22:
            count = count + 1
        G.reset()
    print("The empirical probability of S19 = 21 given S8 = 18 is %f"
          % (count / episodes))

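# The probability printed by problemE is a Monte Carlo frequency estimate. A rough
# standard error for such a Bernoulli estimate (a sketch added for context, assuming
# independent episodes) is sqrt(p_hat * (1 - p_hat) / n):
def mc_standard_error(p_hat, n):
    """Approximate standard error of a Monte Carlo estimate of a probability."""
    import math
    return math.sqrt(p_hat * (1.0 - p_hat) / n)
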
def problemA():
    """
    Have the agent uniformly randomly select actions. Run 10,000 episodes.
    Report the mean, standard deviation, maximum, and minimum of the observed
    discounted returns.
    """
    time_list = []
    reward_list = []
    env = Gridworld()
    env.gamma = 0.9
    episode = 0
    while episode < 10000:
        episode += 1
        print('Episode {}'.format(episode))
        step = 0
        totalReward = 0
        reached = False
        while True:
            step += 1
            act = env.action
            state, reward, isEnd = Gridworld.step(env, act)
            # reward_list.append(reward)
            totalReward += reward
            if isEnd:
                reached = True
                print('Steps taken: {}\tTotal reward: {:.4f}'.format(
                    step, totalReward))
                break
        if not reached:
            episode -= 1
            continue
        Gridworld.reset(env)
        reward_list.append(totalReward)
    print('finished')
    reward_array = np.array(reward_list)
    mean = reward_array.mean()
    std = reward_array.std()
    max_return = reward_array.max()
    min_return = reward_array.min()
    print('Mean: {:.2f}\tSTD: {:.2f}\tMax: {:.2f}\tMin: {:.2f}'.format(
        mean, std, max_return, min_return))
    print('Num of rewards: {}'.format(len(reward_list)))
    with open('./resultA.json', 'w') as file:
        json.dump(reward_list, file)

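# Both problemA above and problemB below dump their per-episode returns to a JSON
# file. A small helper (a convenience sketch, not part of the original scripts) to
# reload one of those files and recompute the summary statistics:
def summarize_result_file(path):
    """Load a JSON list of returns and print mean / std / max / min."""
    with open(path) as f:
        returns = np.array(json.load(f))
    print('Mean: {:.2f}\tSTD: {:.2f}\tMax: {:.2f}\tMin: {:.2f}'.format(
        returns.mean(), returns.std(), returns.max(), returns.min()))
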
def problemE():
    env = Gridworld(startState=18)
    env.gamma = 0.9
    episode = 0
    hit = 0
    total_try = 100000
    while episode < total_try:
        episode += 1
        # Start each trial at state 18 at time step 8 and simulate until time step 19.
        env.timeStep = 8
        while env.timeStep < 19:
            act = env.action
            state, reward, isEnd = Gridworld.step(env, act)
            if isEnd:
                break
        if env.currentState == 21:
            hit += 1
        Gridworld.reset(env)  # return to the start state before the next trial
    print('P is {}'.format(hit / total_try))

def problemB():
    """
    Run the optimal policy that you found for 10,000 episodes. Report the mean,
    standard deviation, maximum, and minimum of the observed discounted returns.
    """
    # If on the upper edge, move right; if on the right edge, move down;
    # else, move right or down at random.
    env = Gridworld()
    env.gamma = 0.9
    episode = 0
    reward_list = []
    # obstacles = [12, 17]
    # waterStates = [6, 18, 22]
    # upperBounds = [0, 1, 2, 3, 4]
    # rightBounds = [4, 9, 14, 19, 24]
    while episode < 10000:
        episode += 1
        print('Episode {}'.format(episode))
        step = 0
        totalReward = 0
        reached = False
        while step < 10000:
            step += 1
            if env.currentState in env.rightBounds:
                act = 2  # Move down
            elif env.currentState in env.upperBounds:
                act = 3  # Move right
            else:
                if random.random() < 0.5:
                    act = 3
                else:
                    act = 2
            # secure = False
            # while not secure:
            #     if act == 2:
            #         nextState = env.currentState + 5
            #         if nextState in env.waterStates or nextState in env.obstacles:
            #             act = 3
            #         else:
            #             secure = True
            #     else:
            #         nextState = env.currentState + 1
            #         if nextState in env.waterStates or nextState in env.obstacles:
            #             act = 2
            #         else:
            #             secure = True
            state, reward, isEnd = Gridworld.step(env, act)
            totalReward += reward
            if isEnd:
                reached = True
                print('Steps taken: {}\tTotal reward: {:.4f}'.format(
                    step, totalReward))
                break
        Gridworld.reset(env)
        if not reached:
            episode -= 1
            continue
        reward_list.append(totalReward)
    print('finished')
    reward_array = np.array(reward_list)
    mean = reward_array.mean()
    std = reward_array.std()
    max_return = reward_array.max()
    min_return = reward_array.min()
    print('Mean: {:.2f}\tSTD: {:.2f}\tMax: {:.2f}\tMin: {:.2f}'.format(
        mean, std, max_return, min_return))
    print('Num of rewards: {}'.format(len(reward_list)))
    with open('./resultB.json', 'w') as file:
        json.dump(reward_list, file)

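# The action choice inside problemB's inner loop encodes a simple policy: move down on
# the right edge, right on the top edge, and otherwise pick right or down at random.
# Factored out as a standalone function (a readability sketch using the same bound
# lists the Gridworld instance exposes, not part of the original code):
def right_or_down_policy(state, right_bounds, upper_bounds):
    """Return action 2 (down) on the right edge, 3 (right) on the top edge, else either."""
    if state in right_bounds:
        return 2
    if state in upper_bounds:
        return 3
    return 3 if random.random() < 0.5 else 2
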