コード例 #1
0
def problemA(num_iters):
    """
    Simulate `num_iters` episodes driven by `Agent.act()` and summarize them.

    For every episode the per-step value ``gridworld.reward`` is weighted by
    ``gridworld.gamma ** t`` (t = 0 for the first step) and accumulated until
    ``gridworld.isEnd`` is truthy; the environment is reset afterwards.
    The assignment asks for a uniformly random agent and 10,000 episodes;
    how `Agent.act()` actually chooses is defined outside this block.

    Prints the mean, standard deviation, maximum and minimum of the returns
    and returns the list of per-episode discounted returns.
    """
    agent = Agent()
    discounted_returns = []
    gridworld = Gridworld()

    for _ in range(num_iters):
        episode_return = 0
        step_index = 0
        finished = False
        while not finished:
            gridworld.step(agent.act())
            # Discount this step's reward by gamma raised to the step index.
            episode_return += gridworld.reward * (gridworld.gamma**step_index)
            finished = gridworld.isEnd
            step_index += 1
        discounted_returns.append(episode_return)
        gridworld.reset()

    # Each statistic is computed and printed in turn, in the listed order.
    report = (
        ('Mean = ', st.mean),
        ('Standard deviation = ', st.stdev),
        ('Max = ', max),
        ('Min = ', min),
    )
    for label, statistic in report:
        print(label, statistic(discounted_returns))

    return discounted_returns
コード例 #2
0
def problemC(num_iters):
    """
    Simulate `num_iters` episodes using `Agent.actOptimally` and summarize.

    At every step the agent is asked for an action given the current
    ``gridworld.state``; the per-step ``gridworld.reward`` is weighted by
    ``gridworld.gamma ** t`` and accumulated until ``gridworld.isEnd`` is
    truthy. The environment is reset after each episode.

    Prints the mean, standard deviation, maximum and minimum of the returns
    and returns the list of per-episode discounted returns.
    """
    agent = Agent()
    discounted_returns = []
    gridworld = Gridworld()
    print("acting optimally")

    for _ in range(num_iters):
        episode_return = 0
        step_index = 0
        finished = False
        while not finished:
            gridworld.step(agent.actOptimally(gridworld.state))
            # Discount this step's reward by gamma raised to the step index.
            episode_return += gridworld.reward * (gridworld.gamma**step_index)
            finished = gridworld.isEnd
            step_index += 1
        discounted_returns.append(episode_return)
        gridworld.reset()

    # Each statistic is computed and printed in turn, in the listed order.
    report = (
        ('Mean = ', st.mean),
        ('Standard deviation = ', st.stdev),
        ('Max = ', max),
        ('Min = ', min),
    )
    for label, statistic in report:
        print(label, statistic(discounted_returns))

    return discounted_returns
コード例 #3
0
def problemA():
    """
    Run 10,000 episodes with the environment's own action and report stats.

    Each episode repeatedly calls ``step`` with the environment's ``action``
    attribute until ``isEnd`` is true, then stores the environment's
    ``reward`` attribute as that episode's return.
    NOTE(review): this presumably relies on Gridworld accumulating the
    discounted return in ``reward`` and sampling ``action`` itself -- confirm
    against the Gridworld implementation.

    Prints the best return (with its 1-based episode number), the mean,
    variance, standard deviation, maximum and minimum, and returns the
    NumPy array of per-episode returns.
    """
    print("PROBLEM A...")
    episodes = 10000
    returns = np.zeros(episodes)
    world = Gridworld()
    world.gamma = 0.9

    for idx in range(episodes):
        world.timeStep = 0
        while not world.isEnd:
            world.step(world.action)
        returns[idx] = world.reward
        world.reset()

    best_return = np.amax(returns)
    best_episode = np.argmax(returns) + 1  # report episodes as 1-based
    mean_return = np.mean(returns)
    var_return = np.var(returns)
    std_return = np.std(returns)
    worst_return = np.amin(returns)

    print("Highest observed discounted returns is %f achieved in episode number %d"
          % (best_return, best_episode))
    print("The mean of discounted returns is %f, variance is %f and standard deviation is %f"
          % (mean_return, var_return, std_return))
    print("Max is %f and min is %f" % (best_return, worst_return))
    return returns
コード例 #4
0
def problemE():
    """
    Empirically estimate Pr(S_19 = 21 | S_8 = 18) under uniform random actions.

    The problem statement numbers states from 0; this Gridworld
    implementation numbers them from 1, so state 18 is 19 here and state 21
    is 22. Each of the 100,000 trials forces the environment into state 19,
    takes at most 11 uniformly random actions (the gap between t = 8 and
    t = 19), stopping early if the episode ends, and counts the trial as a
    success when the final state is 22.

    Prints the estimate together with a 95% two-sided Hoeffding interval.
    """
    print("\nProblem E")
    env = Gridworld()
    N = 100000
    hits = 0

    for _ in range(N):
        env.reset()
        env._state = 19  # condition on being in (implementation) state 19
        for _step in range(11):
            if env.isEnd:
                break
            env.step(np.random.choice(range(4)))
        if env._state == 22:
            hits += 1

    p = hits / N
    # Hoeffding's inequality: half-width of the two-sided 95% interval.
    scale = 1 / (2 * N)
    log_term = np.log(2 / 0.05)
    eps = np.sqrt(scale * log_term)
    message = "Pr(S_19=s_22 | S_8=s_18)={0:.5f} empirically and is in ({1:.5f},{2:.5f}) with 95% confidence using Hoeffding's inequality"
    print(message.format(p, p - eps, p + eps))
コード例 #5
0
def problemC():
    """
    Run a fixed tabular policy for 10,000 episodes and report return stats.

    ``policy`` holds one action code per state (25 entries, indexed by
    ``G.state``). At every step the chosen code is passed through
    ``G.stoch_action`` before being handed to ``G.step``. When an episode
    ends, ``G.reward`` is recorded as that episode's return.
    NOTE(review): this presumably relies on Gridworld accumulating the
    discounted return in ``reward`` -- confirm against the implementation.

    Prints the best return (with its 1-based episode number), the mean,
    variance, standard deviation, maximum and minimum, and returns the
    NumPy array of per-episode returns.
    """
    print("PROBLEM C...")
    policy = np.array([
        3, 3, 3, 3, 1, 0, 3, 3, 3, 1, 0, 2, 4, 3, 1, 0, 2, 4, 3, 1, 0, 2, 3, 3,
        4,
    ])
    episodes = 10000
    arr = np.zeros(episodes)
    G = Gridworld()
    G.gamma = 0.9
    for e in range(episodes):
        # Same attribute name the sibling problems use (`timeStep`); the
        # previous spelling `timestep` only created an unused attribute.
        G.timeStep = 0
        while not G.isEnd:
            G.step(G.stoch_action(policy[G.state]))
        arr[e] = G.reward
        G.reset()

    opt_disc_returns = np.amax(arr)
    opt_episode = np.argmax(arr) + 1  # report episodes as 1-based
    mean = np.mean(arr)
    variance = np.var(arr)
    std_dev = np.std(arr)
    min_return = np.amin(arr)  # named so the builtin `min` is not shadowed
    print("Highest observed discounted returns is %f achieved in"
          " episode number %d" % (opt_disc_returns, opt_episode))
    print("The mean of discounted returns is %f, variance is %f"
          " and standard deviation is %f" % (mean, variance, std_dev))
    print("Max is %f and min is %f" % (opt_disc_returns, min_return))
    return arr
コード例 #6
0
def problemE():
    """
    Empirically estimate Pr(S_19 = 22 | S_8 = 19) under uniform random actions.

    Every episode starts in state 19 (``Gridworld(startState=19)``) and takes
    up to 11 uniformly random actions, the gap between t = 8 and t = 19. An
    episode counts as a success when all 11 steps were taken, the episode has
    not ended, and the state is then 22.
    NOTE(review): starting at state 19 instead of conditioning on real
    trajectories assumes the dynamics are Markovian and time-independent --
    confirm against the Gridworld implementation.

    Prints the raw success count, the estimated probability and the
    wall-clock execution time. No random seed is set, so results vary
    between runs.
    """
    print("Problem E")
    start_time = time.time()

    env = Gridworld(startState=19)
    num_episodes = 1000000
    horizon = 11  # number of transitions between t = 8 and t = 19
    count_s19_22_given_s8_19 = 0
    for _ in range(num_episodes):
        steps_taken = 0
        while steps_taken < horizon and not env.isEnd:
            env.step(np.random.choice([0, 1, 2, 3]))
            steps_taken += 1
        # Only episodes that survived the full horizon can be successes.
        if steps_taken == horizon and not env.isEnd and env.state == 22:
            count_s19_22_given_s8_19 += 1
        env.reset()
    print(count_s19_22_given_s8_19)
    Pr_s19_22_given_s8_19 = float(count_s19_22_given_s8_19) / num_episodes

    end_time = time.time()
    # Label fixed: the estimate is for S_19 = 22 given S_8 = 19, not the reverse.
    print("Estimate of Pr(S_19 = 22 | S_8 = 19) = ", Pr_s19_22_given_s8_19)
    print("Execution time = ", end_time - start_time)
    """
コード例 #7
0
def problemA():
    """
    Run 10,000 episodes with uniformly random actions and report return stats.

    Each step draws an action uniformly from {0, 1, 2, 3}, and the reward
    returned as the third element of ``env.step`` is added to the episode
    total as-is.
    NOTE(review): no discount factor is applied in this function; if the
    returns are meant to be discounted, that must happen inside
    ``Gridworld.step`` -- confirm.

    Prints the mean, standard deviation, maximum and minimum of the episode
    totals and returns them as a list. No random seed is set here.
    """
    print("Problem A")

    env = Gridworld()

    discounted_returns = []
    for _ in range(10000):
        episode_total = 0.0
        while not env.isEnd:
            chosen = np.random.choice([0, 1, 2, 3])
            _, _, reward = env.step(chosen)
            episode_total += reward
        env.reset()
        discounted_returns.append(episode_total)

    # Each statistic is computed and printed in turn, in the listed order.
    report = (
        ("Mean ", np.mean),
        ("Std Dev ", np.std),
        ("Max ", np.max),
        ("Min ", np.min),
    )
    for label, statistic in report:
        print(label, statistic(discounted_returns))

    return discounted_returns
    """
コード例 #8
0
ファイル: evaluate.py プロジェクト: kshitimehta/RL-Project
class Evaluate:
    """
    Callable that scores a policy parameter vector on the Gridworld.

    Calling the instance runs a number of episodes with a TabularSoftmax
    policy using the given parameters and returns the mean discounted
    return. Episode returns are NOT cleared between calls: they keep
    accumulating in ``batchReturn`` (and in the returned mean) until
    ``reset()`` is called.
    """

    # Step budget per episode. An episode still running once the budget is
    # used up is scored TIMEOUT_RETURN instead of its accumulated return.
    MAX_STEPS = 200
    TIMEOUT_RETURN = -50

    def __init__(self):
        self.environment = Gridworld()
        self.policy = TabularSoftmax(25, 4)
        self._G = []

    @property
    def batchReturn(self) -> list:
        """Return the list of episode returns recorded since the last reset."""
        return self._G

    def __call__(self, theta: np.ndarray, numEpisodes: int):
        """
        Run `numEpisodes` episodes with policy parameters `theta`.

        Returns the mean of every episode return recorded since the last
        ``reset()``, which includes returns from earlier calls.
        """
        self.policy.parameters = theta

        for _ in range(numEpisodes):
            self.environment.reset()
            G_episode = 0
            step = 0  # doubles as the discount exponent and the step budget

            while not self.environment.isEnd:
                if step >= self.MAX_STEPS:
                    G_episode = self.TIMEOUT_RETURN
                    break
                state = self.environment.state
                action = self.policy.samplAction(state)
                _, reward, _ = self.environment.step(action)

                G_episode += (self.environment.gamma ** step) * reward
                step += 1

            self._G.append(G_episode)

        return np.mean(self._G)

    def reset(self):
        """Recreate the environment and policy and clear recorded returns."""
        self.environment = Gridworld()
        self.policy = TabularSoftmax(25, 4)
        self._G = []
コード例 #9
0
def problemE():
    """
    Estimate the probability of being in state 22 after 11 steps from state 19.

    Runs 10,000 episodes that start in state 19. Each episode steps the
    environment with its own ``action`` attribute while ``timeStep`` is
    below 11 and the episode has not ended, then checks whether the state
    is 22. The printed message uses the problem statement's numbering
    (S19 = 21 given S8 = 18).
    NOTE(review): the loop presumably relies on ``Gridworld.step`` advancing
    ``timeStep`` -- confirm against the implementation.
    """
    print("PROBLEM E...")
    episodes = 10000
    world = Gridworld(startState=19)
    world.gamma = 0.9
    hits = 0

    for _ in range(episodes):
        world.timeStep = 0
        # Stop after 11 steps or as soon as the episode terminates.
        while not (world.timeStep >= 11 or world.isEnd):
            world.step(world.action)
        if world.state == 22:
            hits += 1
        world.reset()

    estimate = hits / episodes
    print("The empirical probability of S19 = 21 given S8 = 18 is %f" % estimate)
コード例 #10
0
def problemA():
    """
    Run exactly 10,000 episodes with the environment's own action and report.

    Each step passes ``env.action`` to ``env.step`` and adds the returned
    reward to the episode total until the step reports the episode ended.
    NOTE(review): rewards are summed without an explicit discount here; this
    presumably relies on Gridworld (with ``gamma`` set to 0.9) returning
    already-discounted rewards and sampling ``action`` itself -- confirm.

    Prints per-episode progress and the mean, standard deviation, maximum
    and minimum of the episode totals, then writes the list of totals to
    ``./resultA.json``.
    """
    num_episodes = 10000
    reward_list = []
    env = Gridworld()
    env.gamma = 0.9

    # range(1, n + 1) runs exactly n episodes; the previous `<=` loop with a
    # pre-increment ran one episode too many.
    for episode in range(1, num_episodes + 1):
        print('Episode {}'.format(episode))
        step = 0
        totalReward = 0
        while True:
            step += 1
            act = env.action
            state, reward, isEnd = env.step(act)
            totalReward += reward
            if isEnd:
                print('Steps take: {}\tTotal reward: {:.4f}'.format(
                    step, totalReward))
                break
        env.reset()
        reward_list.append(totalReward)
    print('finished')

    reward_array = np.array(reward_list)
    # Suffixed names keep the builtins `max` and `min` usable.
    mean_reward = reward_array.mean()
    std_reward = reward_array.std()
    max_reward = reward_array.max()
    min_reward = reward_array.min()

    print('Mean: {:.2f}\tSTD: {:.2f}\tMax: {:.2f}\tMin: {:.2f}'.format(
        mean_reward, std_reward, max_reward, min_reward))
    print('Num of reward: {}'.format(len(reward_list)))
    with open('./resultA.json', 'w') as out_file:
        json.dump(reward_list, out_file)
コード例 #11
0
ファイル: homework2.py プロジェクト: subendhu19/rl-hw2
def run_gridworld_episode(p):
    """
    Run one Gridworld episode with a TabularSoftmax policy parameterized by p.

    Rewards are discounted by ``environment.gamma ** t``. At least one step
    is always taken. If more than 200 steps are taken, the return is
    replaced by -50 and the episode is abandoned. The environment is reset
    before returning.

    Returns the discounted return of the episode (or -50 on timeout).
    """
    environment = Gridworld()
    policy = TabularSoftmax(25, 4)
    policy.parameters = p

    total = 0
    steps = 0
    while True:
        chosen = policy.samplAction(environment.state)
        _, reward, is_end = environment.step(chosen)
        total += (environment.gamma**steps) * reward
        steps += 1
        if steps > 200:
            # The step budget takes precedence over a normal finish.
            total = -50
            break
        if is_end:
            break

    environment.reset()
    return total
コード例 #12
0
def runEnvironment(getAction, numeps=10000):
    """
    Run `numeps` Gridworld episodes, choosing actions with `getAction`.

    `getAction` is called with the current ``grid.state`` and its result is
    passed to ``grid.step``. The second element of the step result is the
    reward, discounted by ``grid.gamma ** t`` and accumulated until
    ``grid.isEnd`` is true.

    Prints the average, standard deviation, minimum and maximum of the
    returns and returns them as a NumPy array of length `numeps`.
    """
    returns = np.zeros(numeps)
    grid = Gridworld()

    for episode_idx in range(numeps):
        grid.reset()
        t = 0
        discounted_sum = 0
        while not grid.isEnd:
            _, reward, _ = grid.step(getAction(grid.state))
            discounted_sum += (grid.gamma ** t) * reward
            t += 1
        returns[episode_idx] = discounted_sum

    summary = "Average: {}\nStandard Deviation: {}\nMin: {}\nMax: {}".format(
        np.mean(returns), np.std(returns), np.min(returns), np.max(returns))
    print(summary)
    return returns
コード例 #13
0
def runEnvironment_gridworld(policy, numeps=10000):
    """
    Run `numeps` Gridworld episodes with actions sampled from `policy`.

    Actions come from ``policy.samplAction(grid.state)``; rewards are
    discounted by ``grid.gamma ** t``. An episode that takes more than 200
    steps is cut off and scored -50.

    Returns a NumPy array of length `numeps` with one return per episode.
    """
    returns = np.zeros(numeps)
    grid = Gridworld()

    for episode_idx in range(numeps):
        grid.reset()
        t = 0
        discounted_sum = 0
        timed_out = False
        while not (timed_out or grid.isEnd):
            chosen = policy.samplAction(grid.state)
            _, reward, _ = grid.step(chosen)
            discounted_sum += (grid.gamma**t) * reward
            t += 1
            timed_out = t > 200
        # A timed-out episode is scored -50 regardless of what it collected.
        returns[episode_idx] = -50 if timed_out else discounted_sum

    return returns
コード例 #14
0
def problemA():
    """
    Run 10,000 episodes with uniformly random actions and report return stats.

    Each step draws an action with ``np.random.randint(4)`` and adds the
    reward returned by ``grid_world.step`` to the episode total until the
    step reports the episode ended.
    NOTE(review): rewards are summed without an explicit discount here; if
    discounted returns are required, that must happen inside
    ``Gridworld.step`` -- confirm.

    Prints the episode index, total and end flag after every episode, then
    the mean, standard deviation, maximum and minimum of the totals.
    """
    grid_world = Gridworld()
    rewards = []
    for episod in range(10000):
        is_end = False
        grid_world.reset()
        r = 0

        # Logical `not`, not bitwise `~`: for a Python bool `~True` is -2,
        # which is truthy, so the original loop could never terminate.
        while not is_end:
            action = np.random.randint(4)
            r_, is_end = grid_world.step(action)
            r += r_
        rewards.append(r)
        print(episod, r, is_end)
    rewards = np.array(rewards)
    print(rewards.mean(), rewards.std(), rewards.max(), rewards.min())
コード例 #15
0
def problemB():
    """
    Run a fixed per-state policy for 10,000 episodes and report return stats.

    ``optimal_policy_actions`` is indexed by ``env.state`` and holds 25
    entries; entries of -1 presumably mark states where no action is ever
    taken (obstacles or the goal) -- confirm against the Gridworld layout.
    The reward returned as the third element of ``env.step`` is added to
    the episode total as-is.
    NOTE(review): no discount factor is applied in this function; if the
    returns are meant to be discounted, that must happen inside
    ``Gridworld.step`` -- confirm.

    Prints the mean, standard deviation, maximum and minimum of the episode
    totals and returns them as a list.
    """
    print("Problem B")

    optimal_policy_actions = [
        1, 1, 1, 1, 2, 0, 1, 1, 1, 2, 0, 2, -1, 2, 2, 0, 3, -1, 1, 2, 0, 3, 1,
        1, -1
    ]

    env = Gridworld()

    discounted_returns = []
    for _ in range(10000):
        episode_total = 0.0
        while not env.isEnd:
            chosen = optimal_policy_actions[env.state]
            _, _, reward = env.step(chosen)
            episode_total += reward
        discounted_returns.append(episode_total)
        env.reset()

    # Each statistic is computed and printed in turn, in the listed order.
    report = (
        ("Mean ", np.mean),
        ("Std Dev ", np.std),
        ("Max ", np.max),
        ("Min ", np.min),
    )
    for label, statistic in report:
        print(label, statistic(discounted_returns))

    return discounted_returns
    # plt.hist(sorted(discounted_returns), density = True, cumulative=True, label='CDF',
    #      histtype='step', alpha=0.8, color='k')
    # plt.show()
    """
コード例 #16
0
 def __call__(self, parameters: np.array, numEpisodes: int):
     """
     Run `numEpisodes` Gridworld episodes with a fresh TabularSoftmax policy.

     The policy's parameters are set to `parameters`. Rewards (third
     element of the ``env.step`` result) are summed without an explicit
     discount. An episode that reaches 200 steps is cut off and has 50
     subtracted from its accumulated total. Every episode total is also
     appended to ``self.curTrialReturns``.

     Prints and returns the mean episode total of this call.
     """
     policy = TabularSoftmax(25, 4)
     policy.parameters = parameters
     env = Gridworld()

     episode_totals = []
     for _ in range(numEpisodes):
         env.reset()
         total = 0
         steps = 0
         while not env.isEnd:
             chosen = policy.samplAction(env.state)
             _, _, reward = env.step(chosen)
             total += reward
             steps += 1
             if steps == 200:
                 # Penalty is added on top of what was collected so far.
                 total += -50
                 break
         episode_totals.append(total)
         self.curTrialReturns.append(total)

     mean_return = np.mean(episode_totals)
     print("Mean Return ", mean_return)
     return mean_return
コード例 #17
0
def problemB():
    """
    Run a hand-written right/down policy for 10,000 episodes and report stats.

    Policy: if the current state is in ``env.rightBounds`` move down (2);
    otherwise, if it is in ``env.upperBounds`` move right (3); otherwise
    move right or down with equal probability.

    An attempt that does not end within 10,000 steps is discarded and the
    same episode number is retried from a freshly reset environment.
    NOTE(review): rewards are summed without an explicit discount here; this
    presumably relies on Gridworld (with ``gamma`` set to 0.9) returning
    already-discounted rewards -- confirm.

    Prints per-episode progress and the mean, standard deviation, maximum
    and minimum of the episode totals, then writes the list of totals to
    ``./resultB.json``.
    """
    num_episodes = 10000
    max_steps = 10000
    env = Gridworld()
    env.gamma = 0.9
    reward_list = []

    episode = 0  # number of completed episodes so far
    while episode < num_episodes:
        print('Episode {}'.format(episode + 1))
        totalReward = 0
        for step in range(1, max_steps + 1):
            if env.currentState in env.rightBounds:
                act = 2  # Move down
            elif env.currentState in env.upperBounds:
                act = 3  # Move right
            else:
                act = 3 if random.random() < 0.5 else 2

            state, reward, isEnd = env.step(act)
            totalReward += reward
            if isEnd:
                print('Steps take: {}\tTotal reward: {:.4f}'.format(
                    step, totalReward))
                break
        else:
            # Step budget exhausted without finishing: reset so the retry
            # starts a clean episode instead of continuing mid-trajectory.
            env.reset()
            continue
        env.reset()
        reward_list.append(totalReward)
        episode += 1
    print('finished')

    reward_array = np.array(reward_list)
    # Suffixed names keep the builtins `max` and `min` usable.
    mean_reward = reward_array.mean()
    std_reward = reward_array.std()
    max_reward = reward_array.max()
    min_reward = reward_array.min()

    print('Mean: {:.2f}\tSTD: {:.2f}\tMax: {:.2f}\tMin: {:.2f}'.format(
        mean_reward, std_reward, max_reward, min_reward))
    print('Num of reward: {}'.format(len(reward_list)))
    with open('./resultB.json', 'w') as out_file:
        json.dump(reward_list, out_file)