Example #1
def example_6_6():
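    # Compare Q-learning and Sarsa on the cliff-walking environment (Example 6.6),
    # averaging per-episode rewards over EX_6_6_N_SEEDS seeds and plotting smoothed curves.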
    fig, ax = plt.subplots()
    fig.suptitle(f'Example 6.6 (Averaged over {EX_6_6_N_SEEDS} seeds)')
    ax.set_xlabel('Episodes')
    ax.set_ylabel(
        f'(Average of last {EX_6_6_N_AVG}) sum of rewards during episodes')
    ax.set_yticks(EX_6_6_YTICKS)
    ax.set_ylim(bottom=min(EX_6_6_YTICKS))
    n_ep = EX_6_6_N_EPS
    env = TheCliff()
    qlearning_alg = QLearning(env,
                              step_size=EX_6_5_STEP_SIZE,
                              gamma=UNDISCOUNTED,
                              eps=EX_6_5_EPS)
    sarsa_alg = Sarsa(env,
                      step_size=EX_6_5_STEP_SIZE,
                      gamma=UNDISCOUNTED,
                      eps=EX_6_5_EPS)
    qlearning_rew = np.zeros(n_ep)
    sarsa_rew = np.zeros(n_ep)
    for seed in range(EX_6_6_N_SEEDS):
        print(f"seed={seed}")
        qlearning_alg.seed(seed)
        qlearning_rew += qlearning_alg.q_learning(n_ep)
        sarsa_alg.seed(seed)
        sarsa_rew += sarsa_alg.on_policy_td_control(n_ep, rews=True)
    plt.plot(smooth_rewards(qlearning_rew / EX_6_6_N_SEEDS, EX_6_6_N_AVG),
             color='r',
             label='Q learning')
    plt.plot(smooth_rewards(sarsa_rew / EX_6_6_N_SEEDS, EX_6_6_N_AVG),
             color='b',
             label='Sarsa')
    plt.legend()
    plt.savefig('example6.6.png')
    plt.show()
Example #2
def run_sarsa(start, goal, Xrange, Vrange, plot_data_pid):
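    # Train n-step Sarsa agents for n = 1..8 and collect their episode statistics
    # (sarsa.episodes) for plotting alongside the supplied baseline data (plot_data_pid).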
    sarsa_plot_data = list()
    sarsa_plot_data.append(plot_data_pid)

    for i in range(1, 9):
        sarsa = Sarsa(start, goal, Xrange, Vrange, n=i)
        sarsa.train(epoch=EPOCH, max_episode_length=MAX_EPISODE_LENGTH)

        sarsa_plot_data.append(sarsa.episodes)

    plot_with_n(sarsa_plot_data)
Example #3
def plot_sarsa(ax,
               n_ep,
               label=None,
               diags=False,
               stay=False,
               stoch=False,
               seed=0):
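    # Run tabular Sarsa on a windy-gridworld variant (optionally with diagonal moves,
    # a stay action, or stochastic wind) and plot its learning curve.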
    env = WindyGridworld(diags, stay, stoch)
    alg = Sarsa(env,
                step_size=EX_6_5_STEP_SIZE,
                gamma=UNDISCOUNTED,
                eps=EX_6_5_EPS)
    alg.seed(seed)
    kwargs = {"label": label} if label else {}
    plt.plot(alg.on_policy_td_control(n_ep), **kwargs)
Example #4
def main(algorithm, track, x_start, y_start, discount, learning_rate, threshold, max_iterations, epsilon=None, reset_on_crash=False):
    """
    Program entry. Runs selected algorithm on selected track, at given coordinates, with given parameters
    :param algorithm: String
    :param track: String, path to the track file
    :param x_start: Int
    :param y_start: Int
    :param discount: Float
    :param learning_rate: Float
    :param threshold: Float
    :param max_iterations: Int
    :param epsilon: Float
    :param reset_on_crash: Boolean
    :return: None
    """
    with open(track) as f:
        specs = f.readline().strip().split(',')
        rows = int(specs[0])
        cols = int(specs[1])
        layout = f.read().splitlines()

        initial_state = (x_start, y_start, 0, 0)
        initial_action = (0, 0)

        agent = Car(initial_action, epsilon)
        environment = RaceTrack(rows, cols, layout, initial_state, reset_on_crash=reset_on_crash)

        if algorithm == 'value_iteration':
            value_iterator = ValueIteration(discount, threshold, max_iterations, environment, agent)
            value_iterator.run()
            path = value_iterator.extract_policy(initial_state)
            value_iterator.plot_max_diffs()
        elif algorithm == 'q_learning':
            q_learner = QLearning(discount, learning_rate, threshold, max_iterations, environment, agent)
            path = q_learner.run()
            q_learner.plot_avg_cost()
        elif algorithm == 'sarsa':
            sarsa = Sarsa(discount, learning_rate, threshold, max_iterations, environment, agent)
            path = sarsa.run()
            sarsa.plot_avg_cost()
        else:
            print("No algorithm selected")
            return None
        draw_track(path, layout)
Example #5
def main(minutes):
    logging.info('training started for {} minutes'.format(minutes))
    logging.info('max iterations: {}'.format(MAX_ITERATIONS))

    # q = loadQ(currency, interval)

    rewards = []
    errors = []
    ticks = []

    start_time = time.time()
    while (time.time() - start_time) < (minutes * 60):

        with Sarsa(MODEL_FILENAME) as sarsa:
            logging.info('sarsa execution')

        # q, r, error, tick = train(df_inner, q, alpha, epsilon, PERIODS, ACTIONS, pip_mul, info['std'])

        break
Example #6
def main(n_iters=3000, n_games_per_update=10, n_max_steps=1000, n_bins=50):

    env = gym.make('CartPole-v0')
    model = Sarsa(actions=range(2), alpha=0.1, gamma=0.95, epsilon=0.5)
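    # Bin edges below discretize the four continuous CartPole observation dimensions
    # into n_bins buckets each (see build_state).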

    cart_p_bins = pd.cut([-2.4, 2.4], bins=n_bins, retbins=True)[1][1:-1]
    cart_v_bins = pd.cut([-2.0, 2.0], bins=n_bins, retbins=True)[1][1:-1]
    pole_a_bins = pd.cut(
        [-math.radians(41.8), math.radians(41.8)], bins=n_bins,
        retbins=True)[1][1:-1]
    pole_v_bins = pd.cut([-3.0, 3.0], bins=n_bins, retbins=True)[1][1:-1]
    bins = (cart_p_bins, cart_v_bins, pole_a_bins, pole_v_bins)

    # training
    for iter in range(n_iters):
        finished_steps = []
        for game in range(n_games_per_update):
            obs = env.reset()
            state = build_state(obs, bins)
            action = model.choose_action(state)
            for step in range(n_max_steps):
                obs, reward, done, info = env.step(action)
                next_state = build_state(obs, bins)
                next_action = model.choose_action(next_state)
                model.update_q(state, action, reward, next_state, next_action)
                state = next_state
                action = next_action
                if done:
                    finished_steps.append(step)
                    break
        print("[%d / %d]: %.1f" %
              (iter, n_iters, (sum(finished_steps) / len(finished_steps))))

    # testing
    obs = env.reset()
    state = build_state(obs, bins)
    done = False
    count = 0
    while not done:
        env.render()
        action = model.choose_action(state, training=False)
        obs, reward, done, info = env.step(action)
        state = build_state(obs, bins)
        count += 1
    print(count)
Example #7
class Windy4(Windy):
    'Exercise 6.9, part 2'

    def __wind__(self, col: int):
        w = Windy.__wind__(self, col)
        p = random.choice((0, 1, 2))
        if p == 1:  # one cell up
            w += 1
        elif p == 2:  # one cell down
            w -= 1
        return w


if __name__ == '__main__':
    model = Windy()
    p = Sarsa(model, alfa=0.5)
    episode = 0
    total = 0
    while episode < 1000:
        steps = p.estimate(model.start)
        total += steps
        episode += 1
        # print(episode, steps, total)

    s = model.start
    for s in model.episode(s):
        m = s.pi.n
        print('%s %s' % (s.n, m.name))

    # for n,s in model.states.items():
    #     print('%s %s' % (n, s.pi.n))
Example #8
import numpy as np
import gym
import matplotlib.pyplot as plt
from collections import deque
from sarsa import Sarsa

N_EPISODES = 20

env = gym.make('Taxi-v3')
print("Number of States = {}".format(env.nS))
print("Number of Actions = {}".format(env.nA))
current_state = env.reset()

q = np.load("q-agent.npy")
td_agent = Sarsa(env.nS, env.nA, env)
td_agent.q = q
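# td_agent now follows the greedy policy with respect to the pretrained Q-table loaded above.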

scores_window = deque(maxlen=10)
for i_episode in range(N_EPISODES):
    current_state = env.reset()
    done = False
    episode_reward = 0
    while not done:
        next_state, reward, done, _ = env.step(
            np.argmax(td_agent.q[current_state][:]))
        episode_reward += reward
        current_state = next_state
        env.render()
        print()
    scores_window.append(episode_reward)
Example #9
            sarsa_table = RL.learn(str(state), action, reward, str(state_), action_)

            # move to the next state
            state = state_
            action = action_

            # end this episode
            if done:
                break

    # end of game
    print('game over')

    print(sarsa_table)

    env.destroy()


if __name__ == "__main__":

    # create the game world
    env = Maze()

    # create the Q-table and initialize the action space: 0, 1, 2, 3
    RL = Sarsa(actions=list(range(env.n_actions)))

    # Call function once after 100ms
    env.after(100, update)

    env.mainloop()
Example #10
# avg = np.average(np.array(rewards), axis=0)
# std = np.std(np.array(rewards), axis=0)
# maximumEpisodes = avg.shape[0]
# plt.errorbar(np.array([i for i in range(maximumEpisodes)]), avg, std, marker='^', ecolor='g')
# plt.show()

type = "linear"
# best parameter, order 3, e 0.2, alpha 0.5
# best parameter, order 5, e 0.2, alpha 0.5
for e in [0.3]:#, 0.1, 0.01, 0.3, 0.4]:
    for order in [3]: #, 5]:
        for alpha in [0.01]:#, 0.0001, 0.0005, 0.0009, 0.001, 0.005, 0.009, 0.01, 0.05, 0.09, 0.1, 0.5, 0.9]:
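            # Average the Sarsa learning curve over several independent runs
            # for this (e, order, alpha) setting.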
            rewards = []
            print("Alpha: ", alpha)
            for t in tqdm(range(trails)):
                # print("Alpha: %s, Trail: %s" %(alpha, t))
                td = Sarsa(gamma, alpha, env, state_space, steps, e, plot=plot, order=order, discount=discount)
                td.train(episodes)
                rewards.append(td.reward)

            avg = np.average(np.array(rewards), axis=0)
            std = np.std(np.array(rewards), axis=0)
            maximumEpisodes = avg.shape[0]
            plt.errorbar(np.array([i for i in range(maximumEpisodes)]), avg, std, marker='^', ecolor='g')
            #name = "Sarsa/figures/%s/cartPole_type_%s_order%s_alpha%s_e%s.jpg" %(type, type,  order, alpha, e)
            name = "Grid_alpha%s_e%s.jpg" % (alpha, e)
            pickle.dump(avg, open(name, "wb"))
            plt.xlabel("Number of episodes")
            plt.ylabel("Total Reward")
            # plt.savefig(name)
            # plt.close()
Example #11
    }

    for solver_name, solver_fn in mdp_solvers.items():
        print('Final result of {}:'.format(solver_name))
        policy_grids, utility_grids = solver_fn(iterations=25, discount=0.5)
        print(policy_grids[:, :, -1])
        print(utility_grids[:, :, -1])
        plt.figure()
        gw.plot_policy(utility_grids[:, :, -1])
        plot_convergence(utility_grids, policy_grids)
        plt.show()

    sa = Sarsa(num_states=(shape[0] * shape[1]),
               num_actions=4,
               learning_rate=0.8,
               discount_rate=0.9,
               random_action_prob=0.5,
               random_action_decay_rate=0.99,
               dyna_iterations=0)
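    # Tabular Sarsa over the flattened grid, using a random-action (exploration)
    # probability that decays over time.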

    start_state = gw.grid_coordinates_to_indices(start)

    iterations = 1000
    ### IMPORTANT
    # You need to write your own generate_experience function,
    # based on either epsilon-greedy or an exploration function.
    # Make sure your submission includes a new rl_qlearn.py
    # with the updated gw.generate_experience_your_name.
    ### IMPORTANT
    flat_policies, flat_utilities = sa.learn(start_state,
                                             gw.generate_experience,
Example #12
import numpy as np

import helper
from sarsa import Sarsa
from qlearning import QLearning
from sarsa_expected import SarsaExpected

# File to run in order to generate all the plots sequentially
if __name__ == '__main__':
    data_X = np.arange(start=0, stop=10000, step=100)

    # Agent 1: Sarsa(0)
    data_Y1 = np.zeros((100, 1))
    for seed in range(50):
        print(f'Seed: {seed}')
        sarsa = Sarsa(seed=seed, num_actions=4, alpha=0.1)
        y = sarsa.run()
        data_Y1 += y
    data_Y1 /= 50  # average over the 50 seeds
    helper.plotSingle(data_X, data_Y1, "Sarsa(0)")

    # Sarsa(0) with King's move
    data_Y2 = np.zeros((100, 1))
    for seed in range(50):
        print(f'Seed: {seed}')
        sarsa = Sarsa(seed=seed, num_actions=8, alpha=0.1)
        y = sarsa.run()
        data_Y2 += y
    data_Y2 /= 50  # average over the 50 seeds
    helper.plotSingle(data_X, data_Y2, "Sarsa(0) with King's move")
Example #13
# Create the grid World Env
env = gridWorldEnv(easy)
# Maximum number of episodes and the length of each episode
maxEpisode = 5000
epLength = 50
# epsilon greedy Exploration Value
epsilon = 0.3
epsilonDecay = 0.999 # Decay parameter for epsilon

# State and Action Dimension
a_dim = env.action_space_size()
s_dim = env.observation_space_size()

# Initialize the Learning Agent
agent = Sarsa(s_dim, a_dim)

# Reward vectors for plotting
epReward = []
avgReward = []

# File name for saving the results
file_name = 'hw2_sarsa_easyon'


#Start Learning
for epochs in range(maxEpisode):
    state = env.reset()
    total_reward = 0    
    for h in range(epLength):
        action = agent.eGreedyAction(state, epsilon)
Example #14
def digitize_fun(state):
    def bins(clip_min, clip_max, num):
        return np.linspace(clip_min, clip_max, num + 1)[1:-1]

    car_pos, car_v, pole_angle, pole_v = state
    result = [
        np.digitize(car_pos, bins(-2.4, 2.4, 4)),
        np.digitize(car_v, bins(-3.0, 3.0, 4)),
        np.digitize(pole_angle, bins(-0.5, 0.5, 4)),
        np.digitize(pole_v, bins(-2.0, 2.0, 4))
    ]
    x = sum([x * (4**i) for i, x in enumerate(result)])
    return x


q_f = Sarsa(digitize_fun, 0.2, 0.99, 0.15, [0, 1])
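# Sarsa agent acting on the discretized CartPole state (via digitize_fun),
# with the two CartPole actions: 0 (push left) and 1 (push right).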

max_number_of_steps = 200  # maximum score for each game

goal_average_steps = 195
num_consecutive_iterations = 100
last_time_steps = np.zeros(
    num_consecutive_iterations)  # store only the scores of the last 100 games (a buffer of capacity 100)

env = gym.make('CartPole-v0')
for episode in range(5000):
    observation = env.reset()  # reset the environment for this game
    episode_reward = 0
    action = q_f.get_actions(observation)
    next_action = action
    for t in range(max_number_of_steps):
Example #15
    # Show the food.
    for f in food:
        pylab.annotate('food', xy=f, size=5, bbox=dict(boxstyle="round4,pad=.5", fc="0.8"), ha='center')
    
    
    for i in range(len(path) - 1):
        pylab.arrow(path[i][0], path[i][1], path[i+1][0] - path[i][0], path[i+1][1] - path[i][1])


# Parameters.
max_size = 20
food = [(0,8), (4,4), (1,1), (8,8), (6,2), (12, 15), (17,2), (4, 12), (17, 17), (12, 1)]

# Start the algorithm.
sarsa = Sarsa(BarnState((0,0), food, max_size), epsilon=0.1, alpha=0.1, gamma=0.2)
sarsa.seed(int(100* time.time()))
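# Sarsa on the barn task: start at (0, 0) with the food layout above, seeded from the wall clock.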

plot_in = [10, 100, 200, 400, 600, 1000, 1500, 2000, 4000, 5000, 6000, 8000, 10000, 12000, 15000, 20000] 
for i in range(max(plot_in) + 1):
    sarsa.iterate()

    if i % 10 == 0:
        print(i)
    
    if i in plot_in:
        plot_path([s.position for s in sarsa.history])
        pylab.savefig('/tmp/simple-path-4-%d.png' % i)
        print(i)

Example #16
def play(agentType="qlearning",
         worldNumber=0,
         eps=0.1,
         alpha=0.001,
         gamma=0.999):
    # env.action_space: set of possible actions
    # env.action_space.n: number of possible actions
    # env.observation_space: set of possible states
    # env.observation_space.n: number of possible states

    env = gym.make("gridworld-v0")  # initialize an environment

    # setPlan(arg1, arg2)
    # arg1: map file to load
    # arg2: rewards associated with the different cell types of the game
    env.setPlan("gridworldPlans/plan" + str(worldNumber) + ".txt", {
        0: -0.001,
        3: 1,
        4: 1,
        5: -1,
        6: -1
    })

    env.verbose = True

    if agentType == "qlearning":
        agent = Q_Learning(env, eps, alpha, gamma)

    elif agentType == "sarsa":
        agent = Sarsa(env, eps, alpha, gamma)

    elif agentType == "dynaq":
        agent = Dyna_Q(env, eps, alpha, gamma)

    else:
        agent = Q_Learning(env, eps, alpha, gamma)
        print("Unknown agent: defaulting to qlearning")

    # Write a log file over several scenarios
    outdir = 'gridworld-v0/random-agent-results'
    envm = wrappers.Monitor(env,
                            directory=outdir,
                            force=True,
                            video_callable=False)

    #countActions = []
    countRewards = []

    episode_count = 2000
    reward = 0
    done = False
    rsum = 0
    FPS = 0.001

    for i in tqdm(range(episode_count)):
        obs = envm.reset()
        env.verbose = (i % 100 == 0 and i > 0)  # render 1 episode out of 100
        if env.verbose:
            env.render(FPS)
            env.render(mode="human")
        j = 0
        rsum = 0
        while True:
            action = agent.action(obs, reward)
            obs, reward, done, _ = envm.step(action)
            rsum += reward
            j += 1
            if env.verbose:
                env.render(FPS)
            if done:
                print("Episode : " + str(i) + " rsum=" + str(rsum) + ", " +
                      str(j) + " actions")
                #countActions.append(j)
                countRewards.append(rsum)
                break

    np.save(
        "rewards_gridworld_" + str(worldNumber) + "_" + agentType +
        "_alpha0_1.npy", countRewards)
    print("Mean & std : ", np.mean(countRewards), np.std(countRewards))
    print("Reward cum : ", np.sum(countRewards))
    print("done")
    env.close()

    return countRewards
Example #17
if __name__ == '__main__':
    args = get_cmd_args()

    alpha = args.learning_rate
    gamma = args.discount_rate
    epsilon = args.greedy_rate

    actions_number = args.actions_number
    gridworld_height = args.gridworld_height
    gridworld_width = args.gridworld_width

    episode_number = args.episode_number

    background_introduction = '''
	----------- Windy Gridworld with King's Moves -----------

		        1. Learning  Rate: \033[1;31m%.2f\033[0m
		        2. Discount  Rate: \033[1;31m%.2f\033[0m
		        3. Greedy    Rate: \033[1;31m%.2f\033[0m
		        4. Action  Number: \033[1;31m%d\033[0m
		        5. Episode Number: \033[1;31m%d\033[0m

	''' % (alpha, gamma, epsilon, actions_number, episode_number)

    print(background_introduction)

    sarsa = Sarsa(alpha, gamma, epsilon, actions_number, gridworld_height,
                  gridworld_width, episode_number)
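    # Tabular Sarsa configured for the windy gridworld with king's moves,
    # using the parameters printed above.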

    sarsa.sarsa()
Example #18
    for n in range(1, number_of_scenarios + 1):

        # Randomly locate the food on the barn.
        amount_food = randint(max_size // 2, 2 * max_size)
        food = []

        while len(food) < amount_food:

            # Add a new piece of food.
            food.append((randint(0, max_size-1), randint(0, max_size-1)))

            # Ensure uniqueness.
            food = list(set(food))

        # Start the algorithm.
        sarsa = Sarsa(BarnState((0,0), food, max_size), epsilon=epsilon, alpha=alpha, gamma=gamma)
        sarsa.seed(int(100 * time.time()))

        # keep track of how much do we move the q.
        track = []

        for it in range(1, max_iters + 1):

            if it % 10 == 0:
                print("Scenario %d: %d/%d\r" % (n, it, max_iters), end='')
                sys.stdout.flush()

            history, corrections = sarsa.iterate()
            track.append(numpy.sqrt(sum(map(lambda x: x*x, corrections))))
            
            # We're just selecting nice places to evaluate the current policy and create a picture.