def problemA(num_iters):
    """
    Simulate ``num_iters`` episodes using ``Agent.act()`` to choose actions.

    For every episode the discounted return is accumulated as
    ``sum(gridworld.reward * gridworld.gamma ** t)`` over the time steps.
    The mean, sample standard deviation, maximum and minimum of the
    observed returns are printed.

    Per the assignment text the agent is expected to act uniformly at
    random; that depends on ``Agent.act`` -- TODO confirm.

    Returns:
        list: one discounted return per episode.
    """
    actor = Agent()
    returns = []
    world = Gridworld()

    for _ in range(num_iters):
        episode_return = 0
        t = 0
        while True:
            world.step(actor.act())
            # The environment exposes the latest reward as an attribute.
            episode_return += world.reward * (world.gamma ** t)
            if world.isEnd:
                break
            t += 1
        returns.append(episode_return)
        world.reset()

    print('Mean = ', st.mean(returns))
    print('Standard deviation = ', st.stdev(returns))
    print('Max = ', max(returns))
    print('Min = ', min(returns))
    return returns
def problemC(num_iters):
    """
    Simulate ``num_iters`` episodes following ``Agent.actOptimally``.

    At every time step the agent is asked for the action to take in the
    current ``gridworld.state``; the discounted return of each episode is
    accumulated as ``sum(gridworld.reward * gridworld.gamma ** t)``.
    The mean, sample standard deviation, maximum and minimum of the
    observed returns are printed.

    Returns:
        list: one discounted return per episode.
    """
    actor = Agent()
    returns = []
    world = Gridworld()
    print("acting optimally")

    for _ in range(num_iters):
        episode_return = 0
        t = 0
        while True:
            chosen = actor.actOptimally(world.state)
            world.step(chosen)
            episode_return += world.reward * (world.gamma ** t)
            if world.isEnd:
                break
            t += 1
        returns.append(episode_return)
        world.reset()

    print('Mean = ', st.mean(returns))
    print('Standard deviation = ', st.stdev(returns))
    print('Max = ', max(returns))
    print('Min = ', min(returns))
    return returns
def problemA():
    """
    Run 10,000 episodes driven by ``Gridworld.action`` and report statistics.

    After each episode the value of ``G.reward`` is stored as that
    episode's return (presumably the environment accumulates the
    discounted return itself -- TODO confirm against Gridworld).
    Prints the best return and its 1-based episode number, the mean,
    variance, standard deviation, maximum and minimum.

    Returns:
        numpy.ndarray: the per-episode values read from ``G.reward``.
    """
    print("PROBLEM A...")
    n_episodes = 10000
    returns = np.zeros(n_episodes)
    world = Gridworld()
    world.gamma = 0.9

    for idx in range(n_episodes):
        world.timeStep = 0
        while not world.isEnd:
            world.step(world.action)
        returns[idx] = world.reward
        world.reset()

    best = np.amax(returns)
    best_episode = np.argmax(returns) + 1  # report episodes 1-based
    average = np.mean(returns)
    spread_sq = np.var(returns)
    spread = np.std(returns)
    worst = np.amin(returns)

    print("Highest observed discounted returns is %f achieved in"
          " episode number %d" % (best, best_episode))
    print("The mean of discounted returns is %f, variance is %f"
          " and standard deviation is %f" % (average, spread_sq, spread))
    print("Max is %f and min is %f" % (best, worst))
    return returns
def problemE():
    """
    Empirically estimate Pr(S_19 = 21 | S_8 = 18) under the uniform random policy.

    Each of the 100,000 trials places the environment in the state above
    the goal, takes at most 11 uniformly random actions (time 8 -> 19),
    stopping early if the episode ends, and counts the trial as a success
    when the final state is the target state. A 95% confidence half-width
    from Hoeffding's inequality is printed alongside the estimate.

    NOTE: assignment state 18 is state 19 in this gridworld implementation
    and assignment state 21 is state 22.
    """
    print("\nProblem E")
    env = Gridworld()
    N = 100000
    success = 0

    for _ in range(N):
        env.reset()
        env._state = 19  # force S_8 to the cell above the goal
        for _ in range(11):
            if env.isEnd:
                break
            env.step(np.random.choice(range(4)))
        if env._state == 22:
            success += 1

    p = success / N
    # Hoeffding's inequality, delta = 0.05
    eps = np.sqrt((1 / (2 * N)) * np.log(2 / 0.05))
    lower, upper = p - eps, p + eps
    print(
        "Pr(S_19=s_22 | S_8=s_18)={0:.5f} empirically and is in ({1:.5f},{2:.5f}) with 95% confidence using Hoeffding's inequality".format(
            p, lower, upper))
def problemC():
    """
    Run a fixed tabular policy for 10,000 episodes and report statistics.

    ``policy[state]`` gives the intended action for each of the 25 grid
    cells; it is passed through ``Gridworld.stoch_action`` before being
    applied. After each episode the value of ``G.reward`` is stored as
    that episode's return (presumably the environment accumulates the
    discounted return itself -- TODO confirm against Gridworld).

    Returns:
        numpy.ndarray: the per-episode values read from ``G.reward``.
    """
    print("PROBLEM C...")
    policy = np.array([
        3, 3, 3, 3, 1,
        0, 3, 3, 3, 1,
        0, 2, 4, 3, 1,
        0, 2, 4, 3, 1,
        0, 2, 3, 3, 4,
    ])
    episodes = 10000
    arr = np.zeros(episodes)
    G = Gridworld()
    G.gamma = 0.9

    for e in range(episodes):
        # Was ``G.timestep``: the sibling routines use ``timeStep``, so the
        # lowercase spelling only created an unused attribute.
        G.timeStep = 0
        while not G.isEnd:
            G.step(G.stoch_action(policy[G.state]))
        arr[e] = G.reward
        G.reset()

    opt_disc_returns = np.amax(arr)
    opt_episode = np.argmax(arr) + 1  # report episodes 1-based
    mean = np.mean(arr)
    variance = np.var(arr)
    std_dev = np.std(arr)
    min_return = np.amin(arr)  # renamed: do not shadow the builtin ``min``

    print("Highest observed discounted returns is %f achieved in"
          " episode number %d" % (opt_disc_returns, opt_episode))
    print("The mean of discounted returns is %f, variance is %f"
          " and standard deviation is %f" % (mean, variance, std_dev))
    print("Max is %f and min is %f" % (opt_disc_returns, min_return))
    return arr
def problemE():
    """
    Empirically estimate Pr(S_19 = 22 | S_8 = 19) under a uniform random policy.

    The environment is created with ``startState=19`` so that local time
    step 0 corresponds to t = 8. An episode contributes to the count when
    it is still running at local time step 11 (t = 19) and is in state 22
    at that moment. The raw count, the estimate and the wall-clock
    execution time are printed.

    NOTE(review): relies on ``env.reset()`` returning to ``startState``;
    confirm against Gridworld. No random seed is set here.
    """
    print("Problem E")
    start_time = time.time()
    env = Gridworld(startState=19)
    num_episodes = 1000000
    count_s19_22_given_s8_19 = 0

    for _ in range(num_episodes):
        time_step = 0
        while (not env.isEnd) and time_step < 12:
            # Local step 11 is absolute time 19.
            if time_step == 11 and env.state == 22:
                count_s19_22_given_s8_19 += 1
            action = np.random.choice([0, 1, 2, 3])
            env.step(action)
            time_step += 1
        env.reset()

    print(count_s19_22_given_s8_19)
    Pr_s19_22_given_s8_19 = (count_s19_22_given_s8_19 * 1.0) / num_episodes
    end_time = time.time()
    # The label previously had the conditional reversed.
    print("Estimate of Pr(S_19 = 22 | S_8 = 19) = ", Pr_s19_22_given_s8_19)
    print("Execution time = ", end_time - start_time)
def problemE():
    """
    Estimate the probability of being in state 22 after 11 steps from state 19.

    Runs 10,000 trials starting from ``startState=19``. Each trial steps
    with ``Gridworld.action`` until ``timeStep`` reaches 11 or the episode
    ends, then checks whether the current state is 22. The fraction of
    successful trials is printed.
    """
    print("PROBLEM E...")
    n_trials = 10000
    hits = 0
    world = Gridworld(startState=19)
    world.gamma = 0.9

    for _ in range(n_trials):
        world.timeStep = 0
        while not (world.timeStep >= 11 or world.isEnd):
            world.step(world.action)
        if world.state == 22:
            hits += 1
        world.reset()

    print("The empirical probability of S19 = 21 given S8 = 18 is %f" %
          (hits / n_trials))
def problemA():
    """
    Run 10,000 episodes with uniformly random actions and report statistics.

    Each step samples an action from ``{0, 1, 2, 3}`` with
    ``np.random.choice`` and adds the reward returned by ``env.step`` to
    the episode total. The mean, standard deviation, maximum and minimum
    of the totals are printed.

    NOTE(review): rewards are summed without an explicit ``gamma ** t``
    factor; this yields discounted returns only if ``env.step`` already
    returns discounted rewards -- confirm against Gridworld. No random
    seed is set here.

    Returns:
        list: one accumulated return per episode.
    """
    print("Problem A")
    env = Gridworld()
    discounted_returns = []

    for _ in range(10000):
        discounted_return = 0.0
        while not env.isEnd:
            action = np.random.choice([0, 1, 2, 3])
            # step() returns (actual_action, new_state, reward); only the
            # reward is needed here.
            _, _, reward = env.step(action)
            discounted_return += reward
        env.reset()
        discounted_returns.append(discounted_return)

    print("Mean ", np.mean(discounted_returns))
    print("Std Dev ", np.std(discounted_returns))
    print("Max ", np.max(discounted_returns))
    print("Min ", np.min(discounted_returns))
    return discounted_returns
def problemE(num_iters):
    """
    Estimate P(S_19 = 22 | S_8 = 19) from ``num_iters`` simulated trials.

    Each trial sets the environment state to 19 and applies 11 actions
    from ``Agent.act()`` (time steps 8 through 18, so the last action
    produces S_19), then checks whether the resulting state is 22.
    The empirical frequency is printed.

    NOTE(review): the environment is not reset between trials and the
    inner loop does not stop when the episode ends; whether that matters
    depends on how ``Gridworld.step`` treats terminal states -- confirm.

    Raises:
        ZeroDivisionError: if ``num_iters`` is 0.
    """
    agent = Agent()
    gridworld = Gridworld()
    count = 0

    for _ in range(num_iters):
        gridworld.state = 19  # S_8: the state above the end state
        # t runs over the time steps 8..18; the action chosen at t = 18
        # yields S_19. A distinct name avoids clobbering the outer index.
        for _t in range(8, 19):
            gridworld.step(agent.act())
        if gridworld.state == 22:
            count += 1

    print('P(S_19 = 22| S_8=19) = ', count / num_iters)
class Evaluate:
    """
    Callable that scores tabular-softmax policy parameters on the Gridworld.

    Episode returns are appended to an internal list that is NOT cleared
    between calls; use ``reset()`` to start a fresh batch.
    """

    def __init__(self):
        self.environment = Gridworld()
        self.policy = TabularSoftmax(25, 4)
        self._G = []  # discounted return of every episode since the last reset

    @property
    def batchReturn(self) -> list:
        """Return the list of episode returns collected since the last reset."""
        return self._G

    def __call__(self, theta: np.ndarray, numEpisodes: int):
        """
        Run ``numEpisodes`` episodes with policy parameters ``theta``.

        An episode that is still running after 200 steps is cut off and
        its return is replaced by -50.

        Returns:
            The mean of all returns collected since the last reset (not
            only those of this call).
        """
        self.policy.parameters = theta
        max_steps = 200

        for _ in range(numEpisodes):
            self.environment.reset()
            G_episode = 0
            t = 0  # single counter: both step limit and discount exponent
            while not self.environment.isEnd:
                if t >= max_steps:
                    G_episode = -50
                    break
                state = self.environment.state
                action = self.policy.samplAction(state)
                _, reward, _ = self.environment.step(action)
                G_episode += (self.environment.gamma ** t) * reward
                t += 1
            self._G.append(G_episode)

        return np.mean(self._G)

    def reset(self):
        """Replace the environment and policy with fresh ones and clear the returns."""
        self.environment = Gridworld()
        self.policy = TabularSoftmax(25, 4)
        self._G = []
def problemA():
    """
    Run 10,000 episodes driven by ``Gridworld.action`` and report statistics.

    Each episode steps until the environment reports the end of the
    episode, summing the rewards returned by ``step``. Per-episode
    progress is printed, followed by the mean, standard deviation,
    maximum and minimum of the totals. The list of totals is written to
    ``./resultA.json``.

    NOTE(review): rewards are summed without an explicit ``gamma ** t``
    factor; this yields discounted returns only if ``step`` already
    returns discounted rewards -- confirm against Gridworld.
    """
    env = Gridworld()
    env.gamma = 0.9
    reward_list = []

    # Exactly 10,000 episodes, numbered from 1 (the previous ``<=``
    # condition ran one episode too many).
    for episode in range(1, 10001):
        print('Episode {}'.format(episode))
        step = 0
        totalReward = 0
        while True:
            step += 1
            act = env.action
            state, reward, isEnd = env.step(act)
            totalReward += reward
            if isEnd:
                print('Steps take: {}\tTotal reward: {:.4f}'.format(
                    step, totalReward))
                break
        env.reset()
        reward_list.append(totalReward)

    print('finished')
    reward_array = np.array(reward_list)
    mean = reward_array.mean()
    std = reward_array.std()
    highest = reward_array.max()
    lowest = reward_array.min()
    print('Mean: {:.2f}\tSTD: {:.2f}\tMax: {:.2f}\tMin: {:.2f}'.format(
        mean, std, highest, lowest))
    print('Num of reward: {}'.format(len(reward_list)))
    with open('./resultA.json', 'w') as file:
        json.dump(reward_list, file)
def run_gridworld_episode(p):
    """
    Run one Gridworld episode with a tabular softmax policy.

    Args:
        p: value assigned to the policy's ``parameters`` attribute.

    Returns:
        The discounted return ``sum(gamma ** t * reward)``, or -50 when
        the step counter exceeds 200.
    """
    env = Gridworld()
    softmax = TabularSoftmax(25, 4)
    softmax.parameters = p

    total = 0
    step = 0
    done = False
    while not done:
        chosen = softmax.samplAction(env.state)
        _, r, done = env.step(chosen)
        total += (env.gamma ** step) * r
        step += 1
        if step > 200:
            # Episode ran too long: override the return with the penalty.
            total = -50
            break

    env.reset()
    return total
def runEnvironment(getAction, numeps=10000):
    """
    Run ``numeps`` Gridworld episodes with an action-selection callback.

    Args:
        getAction: callable invoked with the current ``grid.state``; its
            result is passed to ``grid.step``.
        numeps: number of episodes to simulate.

    Prints the average, standard deviation, minimum and maximum of the
    discounted returns.

    Returns:
        numpy.ndarray: the discounted return of each episode.
    """
    returns = np.zeros(numeps)
    world = Gridworld()

    for idx in range(numeps):
        world.reset()
        total = 0
        t = 0
        while not world.isEnd:
            _, reward, _ = world.step(getAction(world.state))
            total += (world.gamma ** t) * reward
            t += 1
        returns[idx] = total

    summary = "Average: {}\nStandard Deviation: {}\nMin: {}\nMax: {}".format(
        np.mean(returns), np.std(returns), np.min(returns), np.max(returns))
    print(summary)
    return returns
def problemE():
    """
    Estimate the probability of being in state 21 at t = 19 given state 18 at t = 8.

    Runs 100,000 trials. Each trial sets ``env.timeStep`` to 8 and steps
    with ``Gridworld.action`` until ``timeStep`` reaches 19 or the episode
    ends, then checks whether ``currentState`` is 21. The environment is
    reset after every trial so that each one starts from the configured
    start state; previously only the first trial did.

    NOTE(review): assumes ``reset()`` returns the environment to
    ``startState`` and that ``step()`` advances ``timeStep`` -- confirm
    against Gridworld. Also confirm that 18 / 21 are the right indices in
    this implementation's state numbering.
    """
    env = Gridworld(startState=18)
    env.gamma = 0.9
    hit = 0
    total_try = 100000

    for _ in range(total_try):
        env.timeStep = 8
        while env.timeStep < 19:
            act = env.action
            state, reward, isEnd = env.step(act)
            if isEnd:
                break
        if env.currentState == 21:
            hit += 1
        # Start the next trial from the start state again.
        env.reset()

    print('P is {}'.format(hit / total_try))
def runEnvironment_gridworld(policy, numeps=10000):
    """
    Run ``numeps`` Gridworld episodes sampling actions from ``policy``.

    Args:
        policy: object whose ``samplAction(state)`` returns the action to take.
        numeps: number of episodes to simulate.

    Returns:
        numpy.ndarray: the discounted return of each episode; an episode
        whose step counter exceeds 200 is cut off and recorded as -50.
    """
    returns = np.zeros(numeps)
    world = Gridworld()

    for idx in range(numeps):
        world.reset()
        total = 0
        t = 0
        while not world.isEnd:
            chosen = policy.samplAction(world.state)
            _, reward, _ = world.step(chosen)
            total += (world.gamma ** t) * reward
            t += 1
            if t > 200:
                total = -50
                break
        returns[idx] = total

    return returns
def problemA():
    """
    Run 10,000 episodes with uniformly random actions and report statistics.

    Each step draws an action with ``np.random.randint(4)`` and adds the
    reward returned by ``grid_world.step`` to the episode total. The
    episode index, total and end flag are printed per episode, followed
    by the mean, standard deviation, maximum and minimum of the totals.

    NOTE(review): rewards are summed without an explicit ``gamma ** t``
    factor; this yields discounted returns only if ``step`` already
    returns discounted rewards -- confirm against Gridworld.
    """
    grid_world = Gridworld()
    rewards = []

    for episod in range(10000):
        is_end = False
        grid_world.reset()
        r = 0
        # Logical ``not``: bitwise ``~`` on a Python bool gives -1 / -2,
        # both truthy, so the loop would never terminate.
        while not is_end:
            action = np.random.randint(4)
            r_, is_end = grid_world.step(action)
            r += r_
        rewards.append(r)
        print(episod, r, is_end)

    rewards = np.array(rewards)
    print(rewards.mean(), rewards.std(), rewards.max(), rewards.min())
def problemB():
    """
    Run a fixed per-state policy for 10,000 episodes and report statistics.

    The action for each step is looked up by the current ``env.state`` in
    a 25-entry table (entries of -1 mark states for which no action is
    specified). The reward returned by ``env.step`` is added to the
    episode total. The mean, standard deviation, maximum and minimum of
    the totals are printed.

    NOTE(review): rewards are summed without an explicit ``gamma ** t``
    factor -- confirm whether ``env.step`` already discounts.

    Returns:
        list: one accumulated return per episode.
    """
    print("Problem B")
    action_table = [
        1, 1, 1, 1, 2,
        0, 1, 1, 1, 2,
        0, 2, -1, 2, 2,
        0, 3, -1, 1, 2,
        0, 3, 1, 1, -1,
    ]
    env = Gridworld()
    returns = []

    for _ in range(10000):
        total = 0.0
        while not env.isEnd:
            chosen = action_table[env.state]
            _, _, reward = env.step(chosen)
            total += reward
        returns.append(total)
        env.reset()

    print("Mean ", np.mean(returns))
    print("Std Dev ", np.std(returns))
    print("Max ", np.max(returns))
    print("Min ", np.min(returns))
    return returns
def __call__(self, parameters: np.array, numEpisodes: int):
    """
    Evaluate tabular-softmax parameters over ``numEpisodes`` Gridworld episodes.

    A fresh policy and environment are created for each call. Rewards
    returned by ``step`` are summed per episode; when the 200th step is
    reached the episode is cut off and 50 is subtracted from its total.
    Every episode total is also appended to ``self.curTrialReturns``.

    Returns:
        The mean episode total of this call (also printed).
    """
    episode_totals = []
    softmax = TabularSoftmax(25, 4)
    softmax.parameters = parameters
    world = Gridworld()

    for _ in range(numEpisodes):
        world.reset()
        total = 0
        steps = 0
        while not world.isEnd:
            chosen = softmax.samplAction(world.state)
            _, _, reward = world.step(chosen)
            total += reward
            steps += 1
            if steps == 200:
                # Step budget exhausted: apply the penalty on top of the total.
                total += -50
                break
        episode_totals.append(total)
        self.curTrialReturns.append(total)

    print("Mean Return ", np.mean(episode_totals))
    return np.mean(episode_totals)
def problemB():
    """
    Run a right/down policy for 10,000 completed episodes and report statistics.

    Action choice per step: move down (2) when the current state is in
    ``env.rightBounds``; otherwise move right (3) when it is in
    ``env.upperBounds``; otherwise pick right or down with equal
    probability. An attempt is capped at 10,000 steps; an attempt that
    hits the cap does not count and is retried (the environment is not
    reset before the retry). The totals are summarised on stdout and
    written to ``./resultB.json``.

    NOTE(review): rewards are summed without an explicit ``gamma ** t``
    factor -- confirm whether ``step`` already discounts.
    """
    env = Gridworld()
    env.gamma = 0.9
    episode = 0
    reward_list = []

    while episode < 10000:
        episode += 1
        print('Episode {}'.format(episode))
        totalReward = 0
        reached = False

        for step in range(1, 10001):
            if env.currentState in env.rightBounds:
                act = 2  # move down
            elif env.currentState in env.upperBounds:
                act = 3  # move right
            else:
                act = 3 if random.random() < 0.5 else 2

            _, reward, isEnd = env.step(act)
            totalReward += reward
            if isEnd:
                reached = True
                print('Steps take: {}\tTotal reward: {:.4f}'.format(
                    step, totalReward))
                break

        if not reached:
            # Step cap hit: this attempt does not count towards the total.
            episode -= 1
            continue

        env.reset()
        reward_list.append(totalReward)

    print('finished')
    reward_array = np.array(reward_list)
    average = reward_array.mean()
    spread = reward_array.std()
    highest = reward_array.max()
    lowest = reward_array.min()
    print('Mean: {:.2f}\tSTD: {:.2f}\tMax: {:.2f}\tMin: {:.2f}'.format(
        average, spread, highest, lowest))
    print('Num of reward: {}'.format(len(reward_list)))
    with open('./resultB.json', 'w') as file:
        json.dump(reward_list, file)