Example #1
File: Q4.py Project: ohad-se/RL3
import numpy as np

# MountainCarWithResetEnv, Solver and run_episode are project-local helpers
# (presumably defined elsewhere in the ohad-se/RL3 repository) and are assumed
# to be importable / in scope here.


def run_Q_learning(seed,
                   epsilon_current=0.1,
                   max_episodes=10000,
                   epsilon_decrease=1.,
                   start_at_bottom=False):
    env = MountainCarWithResetEnv()
    np.random.seed(seed)
    env.seed(seed)

    gamma = 0.999
    learning_rate = 0.05
    epsilon_min = 0.05

    solver = Solver(
        # learning parameters
        gamma=gamma,
        learning_rate=learning_rate,
        # feature extraction parameters
        number_of_kernels_per_dim=[7, 5],
        # env dependencies (DO NOT CHANGE):
        number_of_actions=env.action_space.n,
    )

    bottom_state = np.asarray([-0.5, 0])
    bottom_state_val = []
    success_rates = []
    episodes_gain = []
    episodes_bellman_err = []
    for episode_index in range(1, max_episodes + 1):
        episode_gain, mean_delta = run_episode(env,
                                               solver,
                                               is_train=True,
                                               epsilon=epsilon_current,
                                               start_at_bottom=start_at_bottom)
        episodes_gain.append(episode_gain)
        # reduce epsilon if required
        epsilon_current *= epsilon_decrease
        epsilon_current = max(epsilon_current, epsilon_min)
        episodes_bellman_err.append(mean_delta)
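        # track the Q-value of the greedy action at the valley-bottom state
        # (-0.5, 0) as a per-episode measure of learning progress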
        bottom_state_features = solver.get_features(bottom_state)
        bottom_state_max_action = solver.get_max_action(bottom_state)
        bottom_state_val.append(
            solver.get_q_val(bottom_state_features, bottom_state_max_action))

        # termination condition:
        if episode_index % 10 == 9:
            test_gains = [
                run_episode(env, solver, is_train=False, epsilon=0.)[0]
                for _ in range(10)
            ]
            mean_test_gain = np.mean(test_gains)
            success_rates.append(np.mean(np.asarray(test_gains) > -200))
            print(f'tested 10 episodes: mean gain is {mean_test_gain}')
            if mean_test_gain >= -75.:
                print(f'solved in {episode_index} episodes')
                break

    return episodes_gain, success_rates, bottom_state_val, episodes_bellman_err
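
A minimal usage sketch for run_Q_learning (not part of the original snippet): the seed values, the matplotlib plotting, and the __main__ guard are assumptions added here purely for illustration.

import matplotlib.pyplot as plt

if __name__ == '__main__':
    # Run the trainer for a few illustrative seeds and plot per-episode gains.
    for seed in [123, 234, 345]:
        gains, success_rates, bottom_vals, bellman_errs = run_Q_learning(seed)
        plt.plot(gains, label=f'seed {seed}')
    plt.xlabel('training episode')
    plt.ylabel('episode gain')
    plt.legend()
    plt.show()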
Example #2
    seeds = [123]
    epsilons = [1]

    gamma = 0.999
    learning_rate = 0.05
    epsilon_decrease = 0.99
    epsilon_min = 0.01

    max_episodes = 10000

    seed_rewards, seed_performance, seed_bottom_val, seed_bellman_err_avg = [], [], [], []
    for seed in seeds:
        env = MountainCarWithResetEnv()
        np.random.seed(seed)
        env.seed(seed)

        solver = Solver(
            # learning parameters
            gamma=gamma,
            learning_rate=learning_rate,
            # feature extraction parameters
            number_of_kernels_per_dim=[5, 7],
            # env dependencies (DO NOT CHANGE):
            number_of_actions=env.action_space.n,
        )

        for epsilon_current in epsilons:
            rewards, performance, bottom_val, bellman_err_avg, bellman_err = [], [], [], [], []
            for episode_index in range(0, max_episodes):
                episode_gain, mean_delta = run_episode(env,
                                                       solver,
                                                       is_train=True,
                                                       epsilon=epsilon_current)
Example #3
from collections import defaultdict

import numpy as np

# MountainCarWithResetEnv, Solver and run_episode are again assumed to be
# importable project-local helpers.


def run_q_learning_training(seed, epsilon=0.1, max_episodes=1000):
    env = MountainCarWithResetEnv()
    np.random.seed(seed)
    env.seed(seed)

    gamma = 0.999
    learning_rate = 0.01

    solver = Solver(
        # learning parameters
        gamma=gamma,
        learning_rate=learning_rate,
        # feature extraction parameters
        number_of_kernels_per_dim=[7, 5],
        # env dependencies (DO NOT CHANGE):
        number_of_actions=env.action_space.n,
    )
    train_statistics = defaultdict(list)

    bellman_error = list()
    bellman_error_index = 100
    for episode_index in range(1, max_episodes + 1):
        episode_gain, mean_delta = run_episode(env,
                                               solver,
                                               is_train=True,
                                               epsilon=epsilon)
        bellman_error.append(mean_delta)

        print(
            f'After {episode_index}, reward = {episode_gain}, epsilon {epsilon}, average error {mean_delta}'
        )
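        # track the greedy Q-value of the (reset) initial state as a
        # learning-progress signal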
        env.reset()
        init_state = env.state
        phi_st_0 = solver.get_state_action_features(init_state, 0)
        phi_st_1 = solver.get_state_action_features(init_state, 1)
        phi_st_2 = solver.get_state_action_features(init_state, 2)
        Q_st_0 = phi_st_0.transpose() @ solver.theta
        Q_st_1 = phi_st_1.transpose() @ solver.theta
        Q_st_2 = phi_st_2.transpose() @ solver.theta

        train_statistics["init_state"].append(max(Q_st_0, Q_st_1, Q_st_2))
        train_statistics["reward"].append(episode_gain)

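        # record the Bellman error averaged over the last ~100 training episodes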
        if episode_index % 100 == 99:
            train_statistics["bellman_error"].append(np.mean(bellman_error))
            train_statistics["bellman_error_index"].append(bellman_error_index)
            bellman_error_index += 100
            bellman_error = list()

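        # evaluation / termination: every 10 episodes run 10 greedy test
        # episodes and stop once the mean test gain reaches -75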
        if episode_index % 10 == 9:
            test_gains = [
                run_episode(env, solver, is_train=False, epsilon=0.)[0]
                for _ in range(10)
            ]
            mean_test_gain = np.mean(test_gains)
            train_statistics["performance"].append(mean_test_gain)

            print(f'tested 10 episodes: mean gain is {mean_test_gain}')
            if mean_test_gain >= -75.:
                print(f'solved in {episode_index} episodes')
                break

    return train_statistics
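
A usage sketch for run_q_learning_training, assumed rather than taken from the original file: it trains once and plots the statistics collected in the defaultdict above (matplotlib and the chosen seed are illustrative).

import matplotlib.pyplot as plt

if __name__ == '__main__':
    stats = run_q_learning_training(seed=123, epsilon=0.1, max_episodes=1000)
    fig, axes = plt.subplots(1, 3, figsize=(12, 3))
    axes[0].plot(stats['reward'])
    axes[0].set_title('episode gain')
    axes[1].plot(stats['init_state'])
    axes[1].set_title('greedy Q-value at the initial state')
    axes[2].plot(stats['bellman_error_index'], stats['bellman_error'])
    axes[2].set_title('mean Bellman error per 100 episodes')
    for ax in axes:
        ax.set_xlabel('episode')
    plt.tight_layout()
    plt.show()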