コード例 #1
0
def problemA():
    """Run 10000 Gridworld episodes and print summary statistics.

    Each episode steps the environment with the action read from its own
    ``action`` attribute until ``isEnd`` becomes true, then records the
    environment's ``reward`` attribute as that episode's result.
    NOTE(review): ``reward`` is presumably the discounted return of the
    episode (gamma is set to 0.9) -- confirm against the Gridworld class.

    Returns:
        The NumPy array holding one recorded value per episode.
    """
    print("PROBLEM A...")
    num_episodes = 10000
    returns = np.zeros(num_episodes)
    world = Gridworld()
    world.gamma = 0.9

    for idx in range(num_episodes):
        world.timeStep = 0
        # Keep acting until the environment reports the episode is over.
        while True:
            if world.isEnd:
                break
            world.step(world.action)
        returns[idx] = world.reward
        world.reset()

    highest = returns.max()
    # Episodes are reported 1-based, hence the +1 on the array index.
    best_episode = returns.argmax() + 1
    lowest = returns.min()
    average = returns.mean()
    spread_sq = returns.var()
    spread = returns.std()

    print("Highest observed discounted returns is %f achieved in"
          " episode number %d" % (highest, best_episode))
    print("The mean of discounted returns is %f, variance is %f"
          " and standard deviation is %f" % (average, spread_sq, spread))
    print("Max is %f and min is %f" % (highest, lowest))
    return returns
コード例 #2
0
def problemC():
    """Run a fixed 25-entry policy for 10000 episodes and print statistics.

    ``policy`` holds one action index per state. At every step the entry
    for the current ``G.state`` is passed through ``G.stoch_action`` and the
    result is handed to ``G.step``. After each episode the environment's
    ``reward`` attribute is recorded.
    NOTE(review): ``reward`` is presumably the episode's discounted return
    and ``stoch_action`` presumably applies the environment's action noise
    -- confirm both against the Gridworld class.

    Returns:
        The NumPy array holding one recorded value per episode.
    """
    print("PROBLEM C...")
    policy = np.array([
        3, 3, 3, 3, 1, 0, 3, 3, 3, 1, 0, 2, 4, 3, 1, 0, 2, 4, 3, 1, 0, 2, 3, 3,
        4
    ])
    episodes = 10000
    arr = np.zeros(episodes)
    G = Gridworld()
    G.gamma = 0.9
    for e in range(episodes):
        # The counter attribute is spelled ``timeStep`` in the sibling
        # experiments; assigning to a lowercase ``timestep`` would leave the
        # real counter untouched.
        G.timeStep = 0
        while not G.isEnd:
            G.step(G.stoch_action(policy[G.state]))
        arr[e] = G.reward
        G.reset()

    opt_disc_returns = np.amax(arr)
    # Episodes are reported 1-based, hence the +1 on the array index.
    opt_episode = np.argmax(arr) + 1
    mean = np.mean(arr)
    variance = np.var(arr)
    std_dev = np.std(arr)
    lowest = np.amin(arr)  # named to avoid shadowing the builtin ``min``
    print("Highest observed discounted returns is %f achieved in"
          " episode number %d" % (opt_disc_returns, opt_episode))
    print("The mean of discounted returns is %f, variance is %f"
          " and standard deviation is %f" % (mean, variance, std_dev))
    print("Max is %f and min is %f" % (opt_disc_returns, lowest))
    return arr
コード例 #3
0
def problemE():
    """Estimate a state-reachability probability by simulation.

    Runs 10000 episodes on a Gridworld created with ``startState=19``. Each
    episode keeps stepping with the environment's own ``action`` while
    ``timeStep`` is below 11 and the episode has not ended, then counts the
    episode as a hit when the final ``state`` equals 22. The hit fraction
    is printed.
    NOTE(review): the printed message talks about states 18 and 21 while
    the code uses 19 and 22 -- presumably this Gridworld numbers states
    from 1; confirm.
    """
    print("PROBLEM E...")
    num_episodes = 10000
    hits = 0
    world = Gridworld(startState=19)
    world.gamma = 0.9
    for _ in range(num_episodes):
        world.timeStep = 0
        while world.timeStep < 11 and not world.isEnd:
            world.step(world.action)
        if world.state == 22:
            hits += 1
        world.reset()
    estimate = hits / num_episodes
    print("The empirical probability of S19 = 21 given S8 = 18 is %f" %
          (estimate))
コード例 #4
0
def problemA():
    """
    Have the agent uniformly randomly select actions. Run 10,000 episodes.
    Report the mean, standard deviation, maximum, and minimum of the observed
    discounted returns.

    Every step uses the action read from ``env.action`` and adds the reward
    returned by ``env.step`` to the episode total. The per-episode totals are
    printed as summary statistics and written to ``./resultA.json``.
    NOTE(review): the totals are a plain sum of the rewards returned by
    ``step``; whether those rewards are already discounted by ``env.gamma``
    depends on the Gridworld class -- confirm.
    """
    num_episodes = 10000
    reward_list = []
    env = Gridworld()
    env.gamma = 0.9

    # range(1, n + 1) gives exactly n episodes. A ``<=`` comparison on a
    # counter that is incremented at the top of the loop would run n + 1.
    for episode in range(1, num_episodes + 1):
        print('Episode {}'.format(episode))
        step = 0
        total_reward = 0
        # The only way out of this loop is the environment signalling the
        # end of the episode, so every episode that gets here is complete.
        while True:
            step += 1
            _, reward, is_end = env.step(env.action)
            total_reward += reward
            if is_end:
                print('Steps take: {}\tTotal reward: {:.4f}'.format(
                    step, total_reward))
                break
        env.reset()
        reward_list.append(total_reward)
    print('finished')

    reward_array = np.array(reward_list)
    mean = reward_array.mean()
    std = reward_array.std()
    # Named so the builtins ``max`` and ``min`` are not shadowed.
    highest = reward_array.max()
    lowest = reward_array.min()

    print('Mean: {:.2f}\tSTD: {:.2f}\tMax: {:.2f}\tMin: {:.2f}'.format(
        mean, std, highest, lowest))
    print('Num of reward: {}'.format(len(reward_list)))
    with open('./resultA.json', 'w') as fh:
        json.dump(reward_list, fh)
コード例 #5
0
def problemE():
    """Estimate by simulation how often state 21 is occupied at time step 19.

    The environment is created with ``startState=18``. Each trial sets
    ``timeStep`` to 8 and steps with the environment's own ``action`` until
    ``timeStep`` reaches 19 or the episode ends. The trial is a hit when
    ``currentState`` is 21 afterwards. The hit fraction over 100000 trials
    is printed.
    """
    env = Gridworld(startState=18)
    env.gamma = 0.9
    hit = 0
    total_try = 100000
    for _ in range(total_try):
        # Each trial must begin from the start state again; without a reset
        # the next trial would continue from wherever the previous one
        # stopped. Assumes reset() restores the configured startState --
        # confirm against the Gridworld class.
        env.reset()
        # Set after the reset so the clock starts at 8 regardless of what
        # reset() does to timeStep.
        env.timeStep = 8
        while env.timeStep < 19:
            _, _, is_end = env.step(env.action)
            if is_end:
                break
        if env.currentState == 21:
            hit += 1
    print('P is {}'.format(hit / total_try))
コード例 #6
0
def problemB():
    """
    Run the optimal policy that you found for 10,000 episodes. Report the
    mean, standard deviation, maximum, and minimum of the observed
    discounted returns.

    Policy used here: on the right edge move down; otherwise on the upper
    edge move right; everywhere else pick right or down with equal
    probability. An attempt that does not finish within 10,000 steps is
    discarded and retried. The per-episode reward totals are printed as
    summary statistics and written to ``./resultB.json``.
    NOTE(review): the totals are a plain sum of the rewards returned by
    ``step``; whether those rewards are already discounted by ``env.gamma``
    depends on the Gridworld class -- confirm.
    """
    num_episodes = 10000
    max_steps = 10000
    env = Gridworld()
    env.gamma = 0.9
    reward_list = []

    # Loop until enough *completed* episodes are recorded; a discarded
    # attempt leaves the list unchanged and is therefore retried under the
    # same episode number.
    while len(reward_list) < num_episodes:
        print('Episode {}'.format(len(reward_list) + 1))
        total_reward = 0
        reached = False
        for step in range(1, max_steps + 1):
            if env.currentState in env.rightBounds:
                act = 2  # Move down
            elif env.currentState in env.upperBounds:
                act = 3  # Move right
            else:
                act = 3 if random.random() < 0.5 else 2

            _, reward, is_end = env.step(act)
            total_reward += reward
            if is_end:
                reached = True
                print('Steps take: {}\tTotal reward: {:.4f}'.format(
                    step, total_reward))
                break

        # Reset after every attempt, including discarded ones, so a retry
        # starts from a fresh environment rather than resuming mid-trajectory
        # with a zeroed reward total.
        env.reset()
        if reached:
            reward_list.append(total_reward)
    print('finished')

    reward_array = np.array(reward_list)
    mean = reward_array.mean()
    std = reward_array.std()
    # Named so the builtins ``max`` and ``min`` are not shadowed.
    highest = reward_array.max()
    lowest = reward_array.min()

    print('Mean: {:.2f}\tSTD: {:.2f}\tMax: {:.2f}\tMin: {:.2f}'.format(
        mean, std, highest, lowest))
    print('Num of reward: {}'.format(len(reward_list)))
    with open('./resultB.json', 'w') as fh:
        json.dump(reward_list, fh)