def problemA():
    """Run 10000 episodes of the default Gridworld with a uniformly random
    policy and report statistics of the observed discounted returns.

    Returns:
        numpy.ndarray: the per-episode discounted returns (length 10000).
    """
    print("PROBLEM A...")
    episodes = 10000
    arr = np.zeros(episodes)
    G = Gridworld()
    G.gamma = 0.9
    for e in range(episodes):  # number of episodes loop
        G.timeStep = 0
        # Roll the episode forward until a terminal state is reached.
        while not G.isEnd:
            G.step(G.action)
        # NOTE(review): assumes G.reward accumulates the discounted return
        # for the whole episode — confirm against the Gridworld class.
        arr[e] = G.reward
        G.reset()
    opt_disc_returns = np.amax(arr)
    opt_episode = np.argmax(arr) + 1  # report 1-based episode numbers
    mean = np.mean(arr)
    variance = np.var(arr)
    std_dev = np.std(arr)
    # Renamed from `min` to avoid shadowing the builtin of the same name.
    min_return = np.amin(arr)
    print("Highest observed discounted returns is %f achieved in"
          " episode number %d" % (opt_disc_returns, opt_episode))
    print("The mean of discounted returns is %f, variance is %f"
          " and standard deviation is %f" % (mean, variance, std_dev))
    print("Max is %f and min is %f" % (opt_disc_returns, min_return))
    return arr
def problemE():
    """Empirically estimate the probability printed below by simulating
    10000 truncated episodes from startState=19.

    NOTE(review): the printed message mentions "S19 = 21 given S8 = 18"
    while the code starts at state 19 and tests for state 22 — confirm
    which numbering is intended; behavior is preserved as written.
    """
    print("PROBLEM E...")
    n_episodes = 10000
    hits = 0
    env = Gridworld(startState=19)
    env.gamma = 0.9
    for _ in range(n_episodes):
        env.timeStep = 0
        # Simulate at most 11 time steps, stopping early on a terminal state.
        while env.timeStep < 11 and not env.isEnd:
            env.step(env.action)
        # Tally episodes whose state after the truncated rollout is 22.
        if env.state == 22:
            hits += 1
        env.reset()
    print("The empirical probability of S19 = 21 given S8 = 18 is %f"
          % (hits / n_episodes))
def problemE():
    """Estimate P(currentState == 21) over 100000 rollouts that start at
    state 18 and advance from timeStep 8 up to (at most) timeStep 19.

    NOTE(review): this redefines `problemE` from earlier in the file, so
    at import time only this version is callable — confirm whether one of
    the two definitions should be renamed.
    """
    env = Gridworld(startState=18)
    env.gamma = 0.9
    episode = 0
    hit = 0
    total_try = 100000
    while episode < total_try:
        episode += 1
        env.timeStep = 8
        while env.timeStep < 19:
            act = env.action
            # Idiomatic instance call (was the unbound form
            # `Gridworld.step(env, act)`); state/reward are unused here.
            _, _, isEnd = env.step(act)
            if isEnd:
                break
        if env.currentState == 21:
            hit += 1
        # NOTE(review): no env.reset() between episodes — confirm whether
        # later rollouts are meant to continue from the previous end state.
    print('P is {}'.format(hit / total_try))