Example #1
 def q_learning(self,
                gamma=0.9,
                alpha=0.1,
                alpha_decay=0.99,
                alpha_min=0.1,
                epsilon=1.0,
                epsilon_min=0.1,
                epsilon_decay=0.99,
                n_iter=10000,
                returnStats=False):
     ql = mdp.QLearning(self.prob,
                        self.rewards,
                        gamma,
                        alpha=alpha,
                        alpha_decay=alpha_decay,
                        alpha_min=alpha_min,
                        epsilon=epsilon,
                        epsilon_min=epsilon_min,
                        epsilon_decay=epsilon_decay,
                        n_iter=n_iter)
     run_stats = ql.run()
     # self.plot(run_stats, 'Frozen Lake - Q-Learning')
     expected_values = ql.V
     optimal_policy = ql.policy
     time = ql.time
     if (not returnStats):
         return [
             expected_values, optimal_policy,
             len(run_stats), time,
             np.sum([rs['Mean V'] for rs in run_stats])
         ]
     return run_stats, optimal_policy
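A minimal usage sketch (hypothetical: FrozenLakeSolver stands in for whatever class owns this method and stores the transition and reward matrices as self.prob and self.rewards, which is all the body above relies on):

solver = FrozenLakeSolver()  # hypothetical owner of self.prob / self.rewards
V, policy, n_recorded, elapsed, cum_mean_v = solver.q_learning(gamma=0.9, n_iter=10000)
run_stats, policy = solver.q_learning(returnStats=True)  # raw per-iteration stats instead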
Example #2
def runQItsByDiscount(transitions, rewards, envName, itRange):

    discountRange = np.linspace(.01, .99, 20)
    # discountRange = [.01, .1, .25, .5, .90, .98]

    stats = []
    for i in itRange:
        print()
        print('{0} Itt: {1}'.format(envName, i))
        for discount in discountRange:
            alg = mdp.QLearning(transitions, rewards, discount, n_iter=i)
            result = alg.run()
            runStats = alg.run_stats[-1]
            stat = [
                i, discount, alg.time, runStats['Iteration'],
                runStats['Error'],
                np.mean(alg.V), runStats['Reward'], alg.policy
            ]
            printStat(stat)
            stats.append(stat)
            save('{0} {1} discount policy'.format(envName, i), alg.policy)

    statsArr = np.array(stats)
    save('{0} discount stats'.format(envName), statsArr)

    roundedDiscounts = [round(x, 3) for x in discountRange]
    title = '{0} Q Learning - Time by Discount Factor'.format(envName)
    fig, ax = plt.subplots()
    for i in itRange:
        iStats = statsArr[statsArr[:, 0] == i]
        times = iStats[:, 2]
        plotQAx(ax, discountRange, times, title, 'Discount Factor', 'Time',
                'iterations {0}'.format(i))
    show(title, fig, ax)

    title = '{0} Q Learning - Error by Discount Factor'.format(envName)
    fig, ax = plt.subplots()
    for i in itRange:
        iStats = statsArr[statsArr[:, 0] == i]
        errors = iStats[:, 4]
        plotQAx(ax, discountRange, errors, title, 'Discount Factor', 'Error',
                'iterations {0}'.format(i))
    show(title, fig, ax)

    title = '{0} Q Learning - Reward by Discount Factor'.format(envName)
    fig, ax = plt.subplots()
    for i in itRange:
        iStats = statsArr[statsArr[:, 0] == i]
        rewardVals = iStats[:, 5]
        plotQAx(ax, discountRange, rewardVals, title, 'Discount Factor', 'Reward',
                'iterations {0}'.format(i))
    show(title, fig, ax)

    stats = sorted(stats, key=lambda x: x[5], reverse=True)
    topPolicy = stats[0]
    topPolicy = topPolicy[-1]
    return topPolicy
Example #3
def getQLFrames(env, P, R):
    # epsilon_min=0.1
    # ln(0.1)/ln(epsilon_decay) == iteration of e_min
    # alpha_min=0.001
    # ln(0.001)/ln(alpha_decay) == iteration of a_min
    np.random.seed(1)
    ql = mdp.QLearning(P, R, 0.9, n_iter=100000, alpha_decay=0.99999, epsilon_decay=0.9)
    ql.setVerbose()
    run_stats = ql.run()
    return [step['Value'].reshape(env.nrow, env.ncol) for step in run_stats]
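The comments above give the closed form for when each decay schedule bottoms out. As a quick sanity check with the values used in this call (and assuming the library defaults epsilon=1.0, epsilon_min=0.1, alpha=0.1, alpha_min=0.001 referenced in the comments here and in Example #20):

import math

def floor_iteration(x0, x_min, decay):
    # Iteration k at which x0 * decay**k first reaches x_min: k = ln(x_min / x0) / ln(decay)
    return math.log(x_min / x0) / math.log(decay)

print(floor_iteration(1.0, 0.1, 0.9))        # epsilon: ~22 iterations
print(floor_iteration(0.1, 0.001, 0.99999))  # alpha: ~460,000 iterations
# With n_iter=100000, epsilon sits at its floor almost from the start,
# while alpha never reaches alpha_min before the run ends.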
Example #4
def q_learning(P,
               R,
               gamma=0.99,
               alpha=0.1,
               alpha_decay=0.99,
               alpha_min=0.05,
               epsilon=1.0,
               e_min=0.1,
               e_decay=0.9999,
               n_iter=100000,
               plot=False,
               show=False,
               output="output",
               problem_name="Forest",
               callback=None):
    if alpha < alpha_min:
        alpha_min = alpha
    if epsilon < e_min:
        e_min = epsilon

    args = {
        "alpha": alpha,
        "alpha_decay": alpha_decay,
        "alpha_min": alpha_min,
        "epsilon": epsilon,
        "epsilon_min": e_min,
        "epsilon_decay": e_decay,
        "n_iter": n_iter,
        "iter_callback": callback if problem_name != "Forest" else None
    }
    ql = mdp.QLearning(P, R, gamma, **args)
    ql_results = ql.run()

    if plot:
        mean_v = [i['Mean V'] for i in ql_results]
        iterations = [i['Iteration'] for i in ql_results]
        desc = 'Q-Learning'

        # plot and log results
        plt.clf()
        plt.plot(iterations, mean_v)
        plt.title(f"{problem_name}: {desc}: Mean Utility over Iterations")
        plt.ylabel("Mean Utility")
        plt.xlabel("Iterations")
        plt.tight_layout()
        plt.savefig(f"{output}/{problem_name}-{desc}-utility.png")
        if show:
            plt.show()
        else:
            plt.close()
        print(
            f'Q Learning time: {ql.time}\npolicy: {illustrate_policy(ql.policy, problem_name)}'
        )
    return ql, ql_results
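A possible invocation of this helper, assuming the hiive mdptoolbox example module used elsewhere in these snippets to build a small forest MDP; plotting is disabled so the output directory and illustrate_policy helper are not needed:

from hiive.mdptoolbox import example

P, R = example.forest(S=5, r1=3, r2=15, p=0.2)  # same forest setup as the forest examples below
ql, stats = q_learning(P, R, gamma=0.99, n_iter=100000, plot=False)
print(stats[-1]['Mean V'], ql.policy)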
Example #5
def q_learing_rate_decay(P, R):
    decays = [0.99, 0.9, 0.7, 0.5]
    q_stats = []
    for decay in decays:
        q = mdp.QLearning(P,
                          R,
                          0.9,
                          alpha=0.5,
                          alpha_decay=decay,
                          n_iter=max_iter).run()
        q_stats.append(q)

    return q_stats, decays
Example #6
def q_learing_rate_init(P, R):
    rates = [0.01, 0.05, 0.1, 0.2, 0.3]
    q_stats = []
    for rate in rates:
        q = mdp.QLearning(P,
                          R,
                          0.9,
                          alpha=rate,
                          epsilon_decay=1,
                          n_iter=max_iter).run()
        q_stats.append(q)

    return q_stats, rates
Example #7
def q_gamma(P, R):
    gammas = np.linspace(0.05, 0.95, 3)
    q_stats = []
    for gamma in gammas:
        q = mdp.QLearning(P,
                          R,
                          gamma,
                          alpha=0.2,
                          alpha_decay=0.99,
                          epsilon_decay=0.99,
                          n_iter=max_iter).run()
        q_stats.append(q)

    return q_stats, gammas
Example #8
def q_decay_rate(P, R):
    decays = [0.99, 0.9, 0.7, 0.5]
    q_stats = []
    for decay in decays:
        q = mdp.QLearning(P,
                          R,
                          0.9,
                          alpha=0.01,
                          alpha_decay=0.99,
                          epsilon_decay=decay,
                          n_iter=max_iter).run()
        q_stats.append(q)

    return q_stats, decays
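Each of the four sweep helpers above returns a (list of run_stats, parameter values) pair, so a single plotting sketch covers them all. This is an illustration only: it assumes matplotlib.pyplot is imported as plt and that max_iter is defined at module level, as the helpers themselves already require.

def plot_sweep(q_stats, params, label, title):
    # One 'Mean V' curve per parameter setting; each element of q_stats is the
    # run_stats list (one dict per recorded iteration) returned by QLearning.run().
    fig, ax = plt.subplots()
    for stats, p in zip(q_stats, params):
        iters = [s['Iteration'] for s in stats]
        mean_v = [s['Mean V'] for s in stats]
        ax.plot(iters, mean_v, label='{0}={1}'.format(label, p))
    ax.set_xlabel('Iteration')
    ax.set_ylabel('Mean V')
    ax.set_title(title)
    ax.legend()
    return fig

# e.g. plot_sweep(*q_gamma(P, R), label='gamma', title='Q-Learning - Mean V by gamma')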
Example #9
def run_QL_experiments(problem, name, load=True):
    data = []
    if load:
        with open(os.path.join(out, 'QL_results_' + name + '.pkl'), 'rb') as f:
            df = pickle.load(f)
        return df

    print("===========\nQ-Learning\n==========")
    for gamma in config['ql_discount']:
        for alpha in config['ql_alpha']:
            for eps in config['ql_epsilon']:
                for param in [config['ql_params'][name]]:
                    P, R = problem.p(**param)
                    mdp_util.check(P, R)
                    for n in config['ql_iters']:
                        ql = MDP.QLearning(P,
                                           R,
                                           gamma,
                                           alpha=alpha,
                                           epsilon=eps,
                                           n_iter=n)
                        run_stats = ql.run()
                        data.append([
                            problem.name, ql.S, ql.A, ql.gamma, alpha, eps,
                            [i['Time'] for i in run_stats], ql.max_iter,
                            [i['Mean V'] for i in run_stats],
                            np.std(ql.V), [i['Max V'] for i in run_stats],
                            [i['Reward'] for i in run_stats],
                            [i['Error'] for i in run_stats], ql.policy
                        ])
                        print(problem.name, ql.S, ql.A, gamma, n, alpha, eps)
    df = pd.DataFrame(data,
                      columns=[
                          'name', '#states', '#actions', 'discount', 'alpha',
                          'epsilon', 'time', 'iter', 'mean_V', 'std_V',
                          'max_V', 'reward', 'error_mean', 'policy'
                      ])

    with open(os.path.join(out, 'QL_results_' + name + '.pkl'), 'wb') as f:
        pickle.dump(df, f)
    return df
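A short follow-up sketch (not part of the original experiment) showing one way to rank the runs stored in the returned DataFrame, using the fact that the mean_V column holds one list of per-iteration values per run:

df = run_QL_experiments(problem, name, load=True)          # problem/name as passed above
df['final_mean_V'] = df['mean_V'].apply(lambda v: v[-1])   # last recorded Mean V per run
best = df.sort_values('final_mean_V', ascending=False).iloc[0]
print(best['discount'], best['alpha'], best['epsilon'], best['final_mean_V'])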
Example #10
def frozen_lake_ql(P, R, gamma_range, mapping, shape):
    print("== Q Learning Iteration ==")
    print("gamma    #Iterations     time (ms)")
    prev_policy = []
    prev_gamma = 0
    no_diff_list = []
    standard_policy = []
    for gamma in gamma_range:
        ql = mdp.QLearning(P, R, gamma, n_iter=10e4)
        ql.run()

        timestr = "%0.3f" % (ql.time * 1000)
        atab = " \t"
        spacing = 3

        gamma_str = "%0.2f" % gamma
        msg = gamma_str + atab * spacing + timestr

        print(msg)
        if np.isclose(gamma, 0.95):
            standard_policy.append((ql.policy, mapping, shape))

        if list(ql.policy) == list(prev_policy):
            no_diff_list.append([prev_gamma, gamma])

        prev_policy = ql.policy
        prev_gamma = gamma
    print()
    print("Q Learning Iteration Policy at Gamma = 0.95")
    contents = standard_policy.pop()
    print_policy(contents[0], contents[1], contents[2])
    print()
    no_diff_len = len(no_diff_list)
    str_list = ["No Policy Difference Between These Gammas: "] * no_diff_len
    policy_diffs = zip(str_list, no_diff_list)
    for diff in policy_diffs:
        print("%s %0.2f %0.2f" % (diff[0], diff[1][0], diff[1][1]))
    print()
Example #11
def main():

    print("Create a frozen lake of Size 10x10")
    p = generate_FrozenLake(size=10)
    num_states = len(p)
    num_actions = len(p[0])
    print("Num of States:", num_states)
    print("Num of Actions:", num_actions)
    P = np.zeros((num_actions, num_states, num_states))
    R = np.zeros((num_actions, num_states, num_states))

    for i in range(num_states):
        for j in range(num_actions):
            prob_sum = 0
            for prob, next_state, rewards, done in p[i][j]:
                P[j][i][next_state] += prob
                R[j][i][next_state] = rewards
                prob_sum += prob

    # VI
    for gamma in [.9, 0.6]:
        vi = mdp.ValueIteration(transitions=P,
                                reward=R,
                                gamma=gamma,
                                epsilon=0.000001,
                                max_iter=5000)
        stats_data = vi.run()
        plot_mpd_graph(
            stats_data,
            'VI Frozen_Lake(10x10), Gamma={}, Reward plot'.format(gamma),
            'Reward', 'Reward')

        plot_mpd_graph(
            stats_data,
            'VI Frozen_Lake(10x10), Gamma={}, Time Plot'.format(gamma),
            'Time(seconds)', 'Time')

    # PI
    for gamma in [.9, 0.6]:
        print('PI {}'.format(gamma))
        pi = mdp.PolicyIteration(transitions=P,
                                 reward=R,
                                 gamma=gamma,
                                 max_iter=5000,
                                 eval_type=1)
        stats_data = pi.run()
        plot_mpd_graph(
            stats_data,
            'PI Frozen_Lake(10x10), Gamma={}, error plot'.format(gamma),
            'Error', 'Error')

        plot_mpd_graph(
            stats_data,
            'PI Frozen_Lake(10x10), Gamma={}, Time Plot'.format(gamma),
            'Time(seconds)', 'Time')

    # QLearning
    for alpha in [0.1, 0.4]:
        qlearn = mdp.QLearning(transitions=P,
                               reward=R,
                               gamma=0.6,
                               alpha=alpha,
                               alpha_decay=0.1,
                               alpha_min=0.0001,
                               epsilon=0.1,
                               epsilon_min=0.9,
                               epsilon_decay=0,
                               n_iter=10000)
        stats_data = qlearn.run()
        plot_mpd_graph(
            stats_data,
            'Qlearning Frozen_Lake(10x10),  alpha={}, Error plot'.format(
                alpha), 'Error', 'Error')

        plot_mpd_graph(
            stats_data,
            'Qlearning Frozen_Lake(10x10),  alpha={}, Reward plot'.format(
                alpha), 'Reward', 'Reward')

        plot_mpd_graph(
            stats_data,
            'Qlearning Frozen_Lake(10x10),  alpha={}, Time Plot'.format(alpha),
            'Time(seconds)', 'Time')
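Because the transition and reward arrays are assembled by hand at the top of main(), it can be worth validating them before handing them to the solvers, the way Example #9 does with mdptoolbox's checker. An optional sketch, reusing P and R from above and assuming the hiive fork's util module:

from hiive.mdptoolbox import util as mdp_util

mdp_util.check(P, R)                    # raises if the shapes are wrong or rows are not stochastic
assert np.allclose(P.sum(axis=2), 1.0)  # manual version: every transition row P[a][s] sums to 1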
Example #12
    for discount in np.arange(.1, 1, .2):
        ttt = mdp.ValueIteration(P, R, discount)
        ttt.setVerbose()
        start = clock()
        ttt.run()
        elapsed = clock() - start

    for discount in np.arange(.1, 1, .2):
        ttt = mdp.PolicyIteration(P, R, discount)
        ttt.setVerbose()
        start = clock()
        ttt.run()
        elapsed = clock() - start

    for discount in np.arange(.1, 1, .2):
        qlearner_stats = collections.defaultdict(list)
        ttt = hmdp.QLearning(P, R, discount)
        ttt.setVerbose()
        start = clock()
        ttt.run()
        elapsed = clock() - start
        for stats in ttt.run_stats:
            qlearner_stats['state'].append(stats['State'])
            qlearner_stats['action'].append(stats['Action'])
            qlearner_stats['reward'].append(stats['Reward'])
            qlearner_stats['error'].append(stats['Error'])
            qlearner_stats['time'].append(stats['Time'])
            qlearner_stats['alpha'].append(stats['Alpha'])
            qlearner_stats['epsilon'].append(stats['Epsilon'])
            qlearner_stats['max_v'].append(stats['Max V'])
            qlearner_stats['mean_v'].append(stats['Mean V'])
        qlearner_stats_df = pd.DataFrame(qlearner_stats)
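Since run_stats is already a list of per-iteration dicts, the defaultdict bookkeeping above can be collapsed into a single constructor call, which is what the later examples do with pd.DataFrame.from_records; a minimal equivalent:

# Keeps the library's own column names ('State', 'Action', 'Reward', 'Error',
# 'Time', 'Alpha', 'Epsilon', 'Max V', 'Mean V') rather than the lower-case ones above.
qlearner_stats_df = pd.DataFrame(ttt.run_stats)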
Example #13
def run_forest():
    np.random.seed(0)
    P, R = example.forest(S=5, r1=3, r2=15, p=0.2)
    print("Transition Array: ")
    print(P.shape)
    print(P)  # Transition array A x S x S
    print("Reward Array: ")
    print(R.shape)
    print(R)  # Reward array S x A

    # TODO
    gamma_range = np.array([0.1, 0.9, 0.99])
    alpha_range = np.array([0.01, 0.5, 0.99])
    epsilon_range = np.array([0.1, 0.5, 0.95])
    e_decay_range = np.array([0.1, 0.5, 0.999])

    # gamma_range = np.append(np.linspace(0.1, 0.9, 9), np.linspace(0.91, 0.99, 9))
    # alpha_range = np.append(np.linspace(0.01, 0.1, 9), np.linspace(0.2, 0.99, 4))
    # epsilon_range = np.linspace(0.1, 1.0, 10)
    # e_decay_range = np.append(np.linspace(0.1, 0.9, 4), np.linspace(0.91, 0.99, 9))

    difference_list = np.zeros(gamma_range.shape)
    value_iteration_list = np.zeros(gamma_range.shape)
    value_time_list = np.zeros(gamma_range.shape)
    value_reward_list = np.zeros(gamma_range.shape)
    value_error_list = np.zeros(gamma_range.shape)

    policy_iteration_list = np.zeros(gamma_range.shape)
    policy_time_list = np.zeros(gamma_range.shape)
    policy_reward_list = np.zeros(gamma_range.shape)
    policy_error_list = np.zeros(gamma_range.shape)

    for i, gamma in enumerate(gamma_range):
        print('Gamma %0.2f' % gamma)

        vi = mdp.ValueIteration(transitions=P, reward=R, gamma=gamma, epsilon=0.0001, max_iter=10000)
        vi.setVerbose()
        vi.run()
        vi_stats = vi.run_stats
        value_iteration_list[i] = vi_stats[-1:][0]['Iteration']
        value_time_list[i] = vi_stats[-1:][0]['Time']
        value_reward_list[i] = vi_stats[-1:][0]['Reward']
        value_error_list[i] = vi_stats[-1:][0]['Error']
        plot_stats(vi_stats, ('vi_forest_%0.2f' % gamma))

        pi = mdp.PolicyIteration(transitions=P, reward=R, gamma=gamma, max_iter=10000, eval_type=1)
        pi.setVerbose()
        pi.run()
        stats = pi.run_stats
        policy_iteration_list[i] = stats[-1:][0]['Iteration']
        policy_time_list[i] = stats[-1:][0]['Time']
        policy_reward_list[i] = stats[-1:][0]['Reward']
        policy_error_list[i] = stats[-1:][0]['Error']
        plot_stats(stats, ('pi_forest_%0.2f' % gamma))
        print('Policies Found')
        print('Value Iteration: ' + str(vi.policy))
        print('Policy Iteration: ' + str(pi.policy))

        difference1 = sum([abs(x - y) for x, y in zip(pi.policy, vi.policy)])
        difference_list[i] = difference1
        print("Discrepancy in Policy and Value Iteration: ", difference1)
        print()

    # Plotting
    # Error v Iteration
    plt.clf()
    plt.title('Value Iteration: Error v Iterations')
    plt.xlabel('Iterations')
    plt.ylabel('Error')
    plt.plot(list(value_iteration_list), list(value_error_list))
    plt.tight_layout()
    plt.savefig('plots/forest_experiment/vi_error_v_iteration.png')

    # Reward v Gamma
    plt.clf()
    plt.title('Value Iteration: Reward v Gamma')
    plt.xlabel('Gamma')
    plt.ylabel('Reward')
    plt.plot(list(gamma_range), list(value_reward_list))
    plt.tight_layout()
    plt.savefig('plots/forest_experiment/vi_reward_v_gamma.png')

    # Gamma v Iterations
    plt.clf()
    plt.title('Value Iteration: Gamma v Iterations')
    plt.xlabel('Iterations')
    plt.ylabel('Gamma')
    plt.plot(list(value_iteration_list), list(gamma_range))
    plt.tight_layout()
    plt.savefig('plots/forest_experiment/vi_gamma_v_iterations.png')

    # Gamma v Time
    plt.clf()
    plt.title('Value Iteration: Gamma v Time')
    plt.xlabel('Time')
    plt.ylabel('Gamma')
    plt.plot(list(value_time_list), list(gamma_range))
    plt.tight_layout()
    plt.savefig('plots/forest_experiment/vi_gamma_v_time.png')

    # Reward vs Iterations
    plt.clf()
    plt.title('Value Iteration: Reward v Iterations')
    plt.xlabel('Iterations')
    plt.ylabel('Reward')
    plt.plot(list(value_iteration_list), list(value_reward_list))
    plt.tight_layout()
    plt.savefig('plots/forest_experiment/vi_reward_v_iterations.png')

    # Policy
    # Error v Iteration
    plt.clf()
    plt.title('Policy Iteration: Error v Iterations')
    plt.xlabel('Iterations')
    plt.ylabel('Error')
    plt.plot(list(policy_iteration_list), list(policy_error_list))
    plt.tight_layout()
    plt.savefig('plots/forest_experiment/pi_error_v_iteration.png')

    # Gamma v Reward
    plt.clf()
    plt.title('Policy Iteration: Reward v Gamma')
    plt.xlabel('Gamma')
    plt.ylabel('Reward')
    plt.plot(list(gamma_range), list(policy_reward_list))
    plt.tight_layout()
    plt.savefig('plots/forest_experiment/pi_reward_v_gamma.png')

    # Gamma v Iterations
    plt.clf()
    plt.title('Policy Iteration: Gamma v Iterations')
    plt.xlabel('Iterations')
    plt.ylabel('Gamma')
    plt.plot(list(policy_iteration_list), list(gamma_range))
    plt.tight_layout()
    plt.savefig('plots/forest_experiment/pi_gamma_v_iterations.png')

    # Gamma v Time
    plt.clf()
    plt.title('Policy Iteration: Gamma v Time')
    plt.xlabel('Time')
    plt.ylabel('Gamma')
    plt.plot(list(policy_time_list), list(gamma_range))
    plt.tight_layout()
    plt.savefig('plots/forest_experiment/pi_gamma_v_time.png')

    # Reward vs Iterations
    plt.clf()
    plt.title('Policy Iteration: Reward v Iterations')
    plt.xlabel('Iterations')
    plt.ylabel('Reward')
    plt.plot(list(policy_iteration_list), list(policy_reward_list))
    plt.tight_layout()
    plt.savefig('plots/forest_experiment/pi_reward_v_iterations.png')

    # Gamma vs Policy Differences
    plt.clf()
    plt.title('Gamma v Policy Differences')
    plt.xlabel('Gamma')
    plt.ylabel('Policy Differences')
    plt.plot(list(gamma_range), list(difference_list))
    plt.tight_layout()
    plt.savefig('plots/forest_experiment/gamma_v_differences.png')
    plt.close('all')

    prev_Q = None
    thresh = 1e-4
    print('== Q Learning ==')
    for i, gamma in enumerate(gamma_range):
        for j, alpha in enumerate(alpha_range):
            for k, ep in enumerate(epsilon_range):
                for l, ed in enumerate(e_decay_range):
                    # print('ql: gamma - {}, alpha - {}, epsilon - {}, e_decay - {}'.format(gamma, alpha, ep, ed))
                    ql = mdp.QLearning(transitions=P, reward=R, gamma=gamma, alpha=alpha, alpha_decay=1.0, alpha_min=0.001,
                                       epsilon=ep, epsilon_min=0.1, epsilon_decay=ed, n_iter=10e4)
                    stats = ql.run()
                    plot_stats(stats, ('ql_forest_%0.2f_%0.2f_%0.2f_%0.2f' % (gamma, alpha, ep, ed)))

                    # print('Policy: ')
                    # print(ql.policy)
                    # print(ql.run_stats)

                    df = pd.DataFrame.from_records(ql.run_stats)
                    iteration_list = df['Iteration'][-100:]
                    windowed_reward = df['Reward'][-100:].mean()
                    error_list = df['Error'][-100:].mean()

                    if prev_Q is None:
                        prev_Q = ql.Q
                    else:
                        variation = np.absolute(np.subtract(np.asarray(ql.Q), np.asarray(prev_Q))).max()
                        res = np.abs(np.subtract(np.asarray(prev_Q), np.asarray(ql.Q)))
                        print('Result: ')
                        print(res)
                        print('Variation: ')
                        print(variation)
                        print('Mean Reward for Last 100 Iterations:')
                        print(windowed_reward)
                        if np.all(res < thresh) or variation < thresh or windowed_reward > 1.0:
                            print('Breaking! Below Thresh')
                            print('Found at: gamma - {}, alpha - {}, epsilon - {}, e_decay - {}'.format(
                                gamma, alpha, ep, ed))
                            print('Optimal Policy: ')
                            print(ql.policy)
                            break
Example #14
def comparing_mdps(P, R, mapping, shape):
    print("Comparing the Two Policies")
    vi = mdp.ValueIteration(P, R, 0.9, max_iter=10000)
    vi.run()
    print("Value Function: ")
    print(vi.V)
    print("Policy: ")
    print(vi.policy)
    print_policy(vi.policy, mapping, shape)
    print("Iter: ")
    print(vi.iter)
    print("Time: ")
    print(vi.time)
    # print(vi.run_stats)
    print()
    pi = mdp.PolicyIteration(P, R, 0.9, max_iter=100000)
    pi.run()
    print("Policy Function: ")
    print(pi.V)
    print("Policy: ")
    print(pi.policy)
    print_policy(pi.policy, mapping, shape)
    print("Iter: ")
    print(pi.iter)
    print("Time: ")
    print(pi.time)
    # print(pi.run_stats)
    print()
    pim = mdp.PolicyIterationModified(P, R, 0.9, max_iter=100000, epsilon=0.05)
    pim.run()
    print("Policy Modified Function: ")
    print(pim.V)
    print("Policy: ")
    print(pim.policy)
    print_policy(pim.policy, mapping, shape)
    print("Iter: ")
    print(pim.iter)
    print("Time: ")
    print(pim.time)
    # print(pi.run_stats)
    print()
    ql = mdp.QLearning(
        P,
        R,
        0.9,
        n_iter=10e4,
        epsilon=0.1,
        epsilon_decay=0.1,
        epsilon_min=0.1,
    )
    ql.run()
    print("Q Learning Function: ")
    print(ql.V)
    print("Policy: ")
    print(ql.policy)
    print_policy(ql.policy, mapping, shape)
    print("Mean Discrepancy: ")
    print(ql.error_mean)
    # print(ql.v_mean)
    print("Epsilon: ")
    print(ql.epsilon)
    difference1 = sum([abs(x - y) for x, y in zip(pi.policy, vi.policy)])
    if difference1 > 0:
        print("Discrepancy in Policy and Value Iteration: ", difference1)
        print()
    difference2 = sum([abs(x - y) for x, y in zip(pim.policy, vi.policy)])
    if difference2 > 0:
        print("Discrepancy in Policy Modified and Value Iteration: ",
              difference2)
        print()
    difference3 = sum([abs(x - y) for x, y in zip(pim.policy, pi.policy)])
    if difference3 > 0:
        print("Discrepancy in Policy Modified and Policy Iteration: ",
              difference3)
        print()
    difference4 = sum([abs(x - y) for x, y in zip(vi.policy, ql.policy)])
    if difference4 > 0:
        print("Discrepancy in Q Learning and Value Iteration: ", difference4)
        print()
    difference5 = sum([abs(x - y) for x, y in zip(pi.policy, ql.policy)])
    if difference5 > 0:
        print("Discrepancy in Q Learning and Policy Iteration: ", difference5)
        print()
    difference6 = sum([abs(x - y) for x, y in zip(pim.policy, ql.policy)])
    if difference6 > 0:
        print("Discrepancy in Q Learning and Policy Iteration Modified: ",
              difference6)
        print()
Example #15
def run(verbose=False):
    # env = gym.make('FrozenLake-v0', is_slippery=True)
    env = gym.make('FrozenLake8x8-v0', is_slippery=True)
    # env = gym.make('FrozenLake-v0')

    # Debug
    # print('env.P')
    # pprint(env.P)
    # print('env.R')
    # print(env.R)

    

    P, R = transform_for_MDPToolbox(env)
    # print('Reward')
    # print(R)
    # return

    print('~~~~~~~~~~ FrozenLake-v0 – 4x4 Policy Iteration ~~~~~~~~~~')
    pi = mdp.PolicyIteration(P, R, 0.6, max_iter=100000)

    if verbose:
        pi.setVerbose()
    pi.run()
    util.print_debugs(pi)
    total_r_pi = render_env_policy(env, pi.policy, display=verbose)


    print('~~~~~~~~~~ FrozenLake-v0 – 4x4 Value Iteration ~~~~~~~~~~')
    vi = mdp.ValueIteration(P, R, 0.6, epsilon=0.005, max_iter=10000)
    if verbose:
        vi.setVerbose()
    vi.run()
    util.print_debugs(vi)
    total_r_vi = render_env_policy(env, vi.policy, display=verbose)
    if(vi.policy == pi.policy):
        print('FrozenLake-v0 4x4 - Value and Policy Iteration policies are the same! ')
    else:
        print('FrozenLake-v0 4x4 - Value and Policy Iteration policies are NOT the same. ')


    print('~~~~~~~~~~ FrozenLake-v0 – Q-Learning ~~~~~~~~~~')
    ql = mdp.QLearning(P, R, 0.6, alpha=0.3, epsilon_min=0.005, n_iter=100000)
    if verbose:
        ql.setVerbose()
    start_t = time.process_time()
    ql.run()
    end_t = time.process_time()

    total_r_ql = render_env_policy(env, ql.policy, display=verbose)

# Output
    print('~~~~~~~~~~ FrozenLake-v0 - Policy Iteration ~~~~~~~~~~')
    util.print_debugs(pi)
    print('Total Reward: %f' %total_r_pi)
    print('~~~~~~~~~~ FrozenLake-v0 - Value Iteration ~~~~~~~~~~')
    util.print_debugs(vi)
    print('Total Reward: %f' %total_r_vi)
    print('~~~~~~~~~~ FrozenLake-v0 - Q-Learning ~~~~~~~~~~')
    print('Clock time')
    print(end_t - start_t)
    print('Total Reward: %f' %total_r_ql)
    print(ql.policy)
    
    if(vi.policy == pi.policy):
        print('FrozenLake-v0 - Value and Policy Iteration policies are the same! ')
    else:
        print('FrozenLake-v0 - Value and Policy Iteration policies are NOT the same.')
    
    
    if(vi.policy == ql.policy):
        print('FrozenLake-v0 – QL and VI Policies are the same!')
    else:
        print('FrozenLake-v0 – QL and VI Policies are NOT the same.')
    if(pi.policy == ql.policy):
        print('FrozenLake-v0 – PI and QL Policies are the same!')
    else:
        print('FrozenLake-v0 – PI and QL Policies are NOT the same.')

    print('VI Policy')
    print_policy(vi.policy)
    # print('PI Policy')
    # print_policy(vi.policy)
    print('QL Policy')
    print_policy(ql.policy)


    # Source: 
    #   https://www.oreilly.com/radar/introduction-to-reinforcement-learning-and-openai-gym/
    """
Example #16
def vi_pi_q_comp(P, R):
    vi = mdp.ValueIteration(P, R, 0.60, epsilon=0.001).run()
    pi = mdp.PolicyIteration(P, R, 0.60, eval_type=1).run()
    q = mdp.QLearning(P, R, 0.6, alpha=0.2).run()
    return vi, pi, q
Example #17
if algo == 'pi':
    solver = mdp.PolicyIteration(T, R, 0.9, max_iter=5000)
elif algo == 'vi':
    solver = mdp.ValueIteration(T,
                                R,
                                0.9,
                                epsilon=1e-6,
                                max_iter=5000,
                                initial_value=0)
elif algo == 'q':
    solver = mdp.QLearning(T,
                           R,
                           0.99,
                           alpha=1.0,
                           alpha_decay=0.9999993,
                           alpha_min=0.1,
                           epsilon=1.0,
                           epsilon_min=0.2,
                           epsilon_decay=0.999999,
                           n_iter=6e6,
                           run_stat_frequency=1e4)

solver.setVerbose()

start = time.time()
solver.run()
end = time.time()

if problem == 'forest':
    print(solver.policy)
elif problem == 'frozen':
Example #18
def runQAlphaByIts(transitions, rewards, envName, itRange):
    alphas = np.linspace(.9, .01, 10)
    alphas = [.01, .05, .10, .20, .25]
    discounts = [.1, .5, .9]
    allStats = []
    for discount in discounts:
        stats = []
        for i in alphas:
            print()
            print('{0} Alpha: {1} Discount:{2}'.format(envName, i, discount))
            for itt in itRange:
                alg = mdp.QLearning(transitions,
                                    rewards,
                                    discount,
                                    alpha=i,
                                    n_iter=itt)
                result = alg.run()
                runStats = alg.run_stats[-1]
                stat = [
                    i, discount, alg.time, runStats['Iteration'],
                    runStats['Error'],
                    np.mean(alg.V), runStats['Reward'], alg.policy
                ]
                printStat(stat)
                stats.append(stat)
                save(
                    '{0} alpha {1} discount {2} policy'.format(
                        envName, i, discount), alg.policy)
        statsArr = np.array(stats)
        save('{0} {1} alpha stats'.format(envName, discount), statsArr)

        roundedAlphas = [round(x, 3) for x in alphas]
        title = '{0} Q Learning - Time by Alpha Discount {1}'.format(
            envName, discount)
        fig, ax = plt.subplots()
        for i in alphas:
            iStats = statsArr[statsArr[:, 0] == i]
            times = iStats[:, 2]
            plotQAx(ax, itRange, times, title, 'Iterations', 'Time',
                    'alpha {0:0.3f}'.format(i))
        show(title, fig, ax)

        title = '{0} Q Learning - Error by Alpha Discount {1}'.format(
            envName, discount)
        fig, ax = plt.subplots()
        for i in alphas:
            iStats = statsArr[statsArr[:, 0] == i]
            errors = iStats[:, 4]
            plotQAx(ax, itRange, errors, title, 'Iterations', 'Error',
                    'alpha {0:0.3f}'.format(i))
        show(title, fig, ax)

        title = '{0} Q Learning - Reward by Alpha Discount {1}'.format(
            envName, discount)
        fig, ax = plt.subplots()
        for i in alphas:
            iStats = statsArr[statsArr[:, 0] == i]
            rewardsArr = iStats[:, 5]
            plotQAx(ax, itRange, rewardsArr, title, 'Iterations', 'Reward',
                    'alpha {0:0.3f}'.format(i))
        show(title, fig, ax)
        allStats.extend(stats)

    allStats = sorted(allStats, key=lambda x: x[5], reverse=True)
    topPolicy = allStats[0]
    topPolicy = topPolicy[-1]
    return topPolicy
Example #19
def run_forest(size):
    seed_val = 42
    np.random.seed(seed_val)
    random.seed(seed_val)

    S = size
    r1 = 10  # The reward when the forest is in its oldest state and action ‘Wait’ is performed
    r2 = 50  # The reward when the forest is in its oldest state and action ‘Cut’ is performed
    p = 0.1

    P, R = mdptoolbox.example.forest(S=S, r1=r1, r2=r2,
                                     p=p)  # Defaults left the same

    epsilons = [100, 10, 1, 0.1, 0.01, 0.001, 0.0001, 0.00001, 0.000001]
    epsilons = [0.00001, 0.000001]
    gammas = [0.1, 0.3, 0.5, 0.6, 0.7, 0.8, 0.9, 0.99, 0.999, 0.9999, 0.99999]

    learning_rates = [
        0.001, 0.01, 0.00001, 0.0001, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.8, 0.9,
        1.0
    ]
    lr_decays = [
        1.0, 0.99, 0.9999, 0.999, 0.9, 0.8, 0.7, 0.6, 0.5, 0.4, 0.3, 0.2, 0.1
    ]
    lr_mins = [0.00001, 0.0001, 0.001, 0.01, 0]
    epsilons = [1.0, 0.9, 0.8, 0.7, 0.6, 0.5, 0.4, 0.3, 0.2, 0.1]
    epsilon_decays = [0.99, 0.9999, 0.99999, 0.999999, 0.999, 0.9, 0.8, 0.7]
    epsilon_mins = [0.00001, 0.0001, 0.001, 0.01, 0.1, 0]

    best_lr, best_e, best_g, best_ed, best_em, best_rew = 0, 0, 0, 0, 0, -1
    '''for em in epsilon_mins:
        for am in lr_mins:
            for ad in lr_decays:
                for e in epsilons:
                    for g in gammas:
                        for a in learning_rates:
                            for ed in epsilon_decays:
                                pi = mdp.QLearning(P, R, gamma=g, epsilon=e, epsilon_decay=ed, epsilon_min=em, n_iter=10000,
                                                   alpha=a, alpha_min=am, alpha_decay=ad)
                                pi.run()
                                rew = run_episodes(pi.policy, S, R, p, 1000, 100)
                                print(rew, '-', e, ed, em, a, ad, am, g)'''

    # g     e           ed      em      a   ad  am      rew
    tests = [[0.1, 0.000001, 0.99, 0.0001, 0.6, 0.5, 0.001]]
    # g	    e       	ed	    em  	a  	ad	am	    rew
    # 0.1	1.00E-06	0.99	0.0001	0.6	0.5	0.001	4032

    # 0.1	1.00E-06	0.99	1.00E-05	0.001	0.5	0.01	429.2

    if size < 100:
        tests = [[0.1, 1.0, 0.7, 0.00001, 0.0001, 1.0, 0.00001]]
    else:
        tests = [[0.6, 1.0, 0.999999, 0.00001, 0.8, 1.0, 0.01]]

    if 1 == 1:
        # print(e, ed, em, a, ad, am, g, rew, )
        best_pol_arr = []

        print(size)
        for t in tests:
            for e in epsilons:
                Q_qlearning = mdp.QLearning(P,
                                            R,
                                            gamma=t[0],
                                            epsilon=e,
                                            epsilon_decay=t[2],
                                            epsilon_min=t[3],
                                            n_iter=10000,
                                            alpha=t[4],
                                            alpha_min=t[5],
                                            alpha_decay=t[6])
                Q_qlearning.run()
                best_pol_arr.append(list(Q_qlearning.policy))
                #print(run_episodes(Q_qlearning.policy, S, R, p, 100000, 100))

        # Plot out optimal policy
        # Citation: https://stackoverflow.com/questions/52566969/python-mapping-a-2d-array-to-a-grid-with-pyplot
        print(epsilons)
        cmap = colors.ListedColormap(['blue', 'red'])
        fig, ax = plt.subplots(figsize=(12, 3.5))
        plt.title("Forest Q-Learning Policy - Red = Cut, Blue = Wait")
        epsilons.reverse()

        plt.xticks(fontsize=15)
        plt.xlabel('State', fontsize=15)
        plt.ylabel('Epsilon', fontsize=15)
        plt.pcolor(best_pol_arr[::-1], cmap=cmap, edgecolors='k', linewidths=0)
        ax.set_yticklabels(epsilons, fontsize=15)
        ax.tick_params(left=False)  # remove the ticks
        plt.savefig('Images\\QL-Forest-Policy-' + str(size) + '.png')

        plt.show()

        mean_val = [i["Mean V"] for i in Q_qlearning.run_stats]
        error = [i["Error"] for i in Q_qlearning.run_stats]
        reward = [i["Reward"] for i in Q_qlearning.run_stats]

        # Plot Delta vs iterations
        fig, ax1 = plt.subplots()

        color = 'tab:blue'
        ax1.set_ylabel('Reward/Error', color=color)
        ax1.semilogy(error, color=color, label='Error')
        ax1.semilogy(reward, color='darkblue', label='Reward')
        ax1.legend()
        ax2 = ax1.twinx()  # instantiate a second axes that shares the same x-axis

        color = 'tab:red'
        ax2.set_xlabel('Iterations')
        ax2.set_ylabel('Mean V', color=color)
        ax2.semilogy(mean_val, color=color)
        ax2.tick_params(axis='y', labelcolor=color)

        plt.title('V/Reward/Error vs. Iterations')
        plt.savefig('Images\\QL-Forest-RunStats' + str(size) + '.png')
        plt.show()

        best_rew = 0
        for em in epsilon_mins:
            for am in lr_mins:
                for ad in lr_decays:
                    for e in epsilons:
                        for g in gammas:
                            for a in learning_rates:
                                for ed in epsilon_decays:
                                    Q_qlearning = mdp.QLearning(
                                        P,
                                        R,
                                        gamma=g,
                                        epsilon=e,
                                        epsilon_decay=ed,
                                        epsilon_min=em,
                                        n_iter=10000,
                                        alpha=a,
                                        alpha_min=am,
                                        alpha_decay=ad)
                                    Q_qlearning.run()
                                    rew = run_episodes(Q_qlearning.policy, S,
                                                       R, p, 1000, 200)
                                    if rew > best_rew:
                                        best_rew = rew
                                        print(
                                            e,
                                            ed,
                                            em,
                                            a,
                                            ad,
                                            am,
                                            g,
                                            rew,
                                        )

    for t in tests:

        num_seeds = 10

        for g in gammas:
            tot_rew = 0
            cnt = 0
            for x in range(num_seeds):
                cnt += 1
                seed_val = x
                np.random.seed(seed_val)
                random.seed(seed_val)
                Q_qlearning = mdp.QLearning(P,
                                            R,
                                            gamma=g,
                                            epsilon=t[1],
                                            epsilon_decay=t[2],
                                            epsilon_min=t[3],
                                            n_iter=10000,
                                            alpha=t[4],
                                            alpha_min=t[5],
                                            alpha_decay=t[6])
                Q_qlearning.run()
                rew = run_episodes(Q_qlearning.policy, S, R, p, 1000, 100)
                tot_rew += rew
                print('g', x, g, rew, Q_qlearning.run_stats[-1]['Mean V'])

        for em in epsilon_mins:
            tot_rew = 0
            cnt = 0
            for x in range(num_seeds):
                cnt += 1
                seed_val = x
                np.random.seed(seed_val)
                random.seed(seed_val)
                Q_qlearning = mdp.QLearning(P,
                                            R,
                                            gamma=t[0],
                                            epsilon=t[1],
                                            epsilon_decay=t[2],
                                            epsilon_min=em,
                                            n_iter=10000,
                                            alpha=t[4],
                                            alpha_min=t[5],
                                            alpha_decay=t[6])
                Q_qlearning.run()
                rew = run_episodes(Q_qlearning.policy, S, R, p, 1000, 100)
                tot_rew += rew
                print('em', x, em, rew, Q_qlearning.run_stats[-1]['Mean V'])

        for e in epsilons:
            tot_rew = 0
            cnt = 0
            for x in range(num_seeds):
                cnt += 1
                seed_val = x
                np.random.seed(seed_val)
                random.seed(seed_val)
                Q_qlearning = mdp.QLearning(P,
                                            R,
                                            gamma=t[0],
                                            epsilon=e,
                                            epsilon_decay=t[2],
                                            epsilon_min=t[3],
                                            n_iter=10000,
                                            alpha=t[4],
                                            alpha_min=t[5],
                                            alpha_decay=t[6])
                Q_qlearning.run()
                #print(Q_qlearning.policy)
                rew = run_episodes(Q_qlearning.policy, S, R, p, 1000, 100)
                tot_rew += rew
                print('e', x, e, rew, Q_qlearning.run_stats[-1]['Mean V'])

        for lr in learning_rates:
            tot_rew = 0
            cnt = 0
            for x in range(num_seeds):
                cnt += 1
                seed_val = x
                np.random.seed(seed_val)
                random.seed(seed_val)
                Q_qlearning = mdp.QLearning(P,
                                            R,
                                            gamma=t[0],
                                            epsilon=t[1],
                                            epsilon_decay=t[2],
                                            epsilon_min=t[3],
                                            n_iter=10000,
                                            alpha=lr,
                                            alpha_min=t[5],
                                            alpha_decay=t[6])
                Q_qlearning.run()
                rew = run_episodes(Q_qlearning.policy, S, R, p, 1000, 100)
                tot_rew += rew
                print('lr', x, lr, rew, Q_qlearning.run_stats[-1]['Mean V'])

        for ld in lr_decays:
            tot_rew = 0
            cnt = 0
            for x in range(num_seeds):
                cnt += 1
                seed_val = x
                np.random.seed(seed_val)
                random.seed(seed_val)
                Q_qlearning = mdp.QLearning(P,
                                            R,
                                            gamma=t[0],
                                            epsilon=t[1],
                                            epsilon_decay=t[2],
                                            epsilon_min=t[3],
                                            n_iter=10000,
                                            alpha=t[4],
                                            alpha_min=t[5],
                                            alpha_decay=ld)
                Q_qlearning.run()
                rew = run_episodes(Q_qlearning.policy, S, R, p, 1000, 100)
                tot_rew += rew
                print('ld', x, ld, rew, Q_qlearning.run_stats[-1]['Mean V'])

        for lm in lr_mins:
            tot_rew = 0
            cnt = 0
            for x in range(num_seeds):
                cnt += 1
                seed_val = x
                np.random.seed(seed_val)
                random.seed(seed_val)
                Q_qlearning = mdp.QLearning(P,
                                            R,
                                            gamma=t[0],
                                            epsilon=t[1],
                                            epsilon_decay=t[2],
                                            epsilon_min=t[3],
                                            n_iter=10000,
                                            alpha=t[4],
                                            alpha_min=lm,
                                            alpha_decay=t[6])
                Q_qlearning.run()
                rew = run_episodes(Q_qlearning.policy, S, R, p, 1000, 100)
                tot_rew += rew
                print('lm', x, lm, rew, Q_qlearning.run_stats[-1]['Mean V'])

        for e in epsilons:
            tot_rew = 0
            cnt = 0
            for x in range(num_seeds):
                cnt += 1
                seed_val = x
                np.random.seed(seed_val)
                random.seed(seed_val)
                Q_qlearning = mdp.QLearning(P,
                                            R,
                                            gamma=t[0],
                                            epsilon=e,
                                            epsilon_decay=t[2],
                                            epsilon_min=t[3],
                                            n_iter=10000,
                                            alpha=t[4],
                                            alpha_min=t[5],
                                            alpha_decay=t[6])
                Q_qlearning.run()
                rew = run_episodes(Q_qlearning.policy, S, R, p, 1000, 100)
                tot_rew += rew
                print('e', x, e, rew, Q_qlearning.run_stats[-1]['Mean V'])

        for ed in epsilon_decays:
            tot_rew = 0
            cnt = 0
            for x in range(num_seeds):
                cnt += 1
                seed_val = x
                np.random.seed(seed_val)
                random.seed(seed_val)
                Q_qlearning = mdp.QLearning(P,
                                            R,
                                            gamma=t[0],
                                            epsilon=t[1],
                                            epsilon_decay=ed,
                                            epsilon_min=t[3],
                                            n_iter=10000,
                                            alpha=t[4],
                                            alpha_min=t[5],
                                            alpha_decay=t[6])
                Q_qlearning.run()
                rew = run_episodes(Q_qlearning.policy, S, R, p, 1000, 100)
                tot_rew += rew
                print('ed', x, ed, rew, Q_qlearning.run_stats[-1]['Mean V'])

        for em in epsilon_mins:
            tot_rew = 0
            cnt = 0
            for x in range(num_seeds):
                cnt += 1
                seed_val = x
                np.random.seed(seed_val)
                random.seed(seed_val)
                Q_qlearning = mdp.QLearning(P,
                                            R,
                                            gamma=t[0],
                                            epsilon=t[1],
                                            epsilon_decay=t[2],
                                            epsilon_min=em,
                                            n_iter=10000,
                                            alpha=t[4],
                                            alpha_min=t[5],
                                            alpha_decay=t[6])
                Q_qlearning.run()
                rew = run_episodes(Q_qlearning.policy, S, R, p, 1000, 100)
                tot_rew += rew
                print('em', x, em, rew, Q_qlearning.run_stats[-1]['Mean V'])
Example #20
def run(verbose=False):
    # MDP Forest Problem
    # transitions, reward = example.forest()
    nS = 1000
    # transitions, reward = example.forest(S=nS, r1=250, r2=120, p=0.01, is_sparse=False)
    transitions, reward = example.forest(S=nS,
                                         r1=1045,
                                         r2=1025,
                                         p=0.01,
                                         is_sparse=False)

    # print(transitions)
    # print (reward)
    # return
    print('~~~~~~~~~~ Forest - Policy Iteration ~~~~~~~~~~')
    pi = mdp.PolicyIteration(transitions, reward, 0.75, max_iter=10000)
    if verbose:
        pi.setVerbose()
    pi.run()
    util.print_debugs(pi)
    # print(pi.run_stats)
    # return

    print('~~~~~~~~~~ Forest - Value Iteration ~~~~~~~~~~')
    vi = mdp.ValueIteration(transitions, reward, 0.75, max_iter=100000)
    if verbose:
        vi.setVerbose()
    vi.run()
    util.print_debugs(vi)

    if (vi.policy == pi.policy):
        print('Forest - Value and Policy Iteration policies are the same! ')
    else:
        print('Forest - Value and Policy Iteration policies are NOT the same.')

    print('~~~~~~~~~~ Forest - Q-Learning ~~~~~~~~~~')
    # transitions, reward, gamma,
    #  alpha=0.1, alpha_decay=0.99, alpha_min=0.001,
    #  epsilon=1.0, epsilon_min=0.1, epsilon_decay=0.99,
    #  n_iter=10000, skip_check=False, iter_callback=None,
    #  run_stat_frequency=None):

    ql = mdp.QLearning(transitions,
                       reward,
                       0.75,
                       alpha=0.3,
                       epsilon_min=0.005,
                       n_iter=500000)
    if verbose:
        ql.setVerbose()
    start_t = time.process_time()
    ql.run()
    end_t = time.process_time()

    # Output
    print('~~~~~~~~~~ Forest - Policy Iteration ~~~~~~~~~~')
    util.print_debugs(pi)
    print('~~~~~~~~~~ Forest - Value Iteration ~~~~~~~~~~')
    util.print_debugs(vi)
    print('~~~~~~~~~~ Forest - Q-Learning ~~~~~~~~~~')
    print(ql.policy)
    print('Q-Learning # of Iterations: %i' % q_counter)
    print('Clock time')
    print(end_t - start_t)

    if (vi.policy == pi.policy):
        print('Forest - Value and Policy Iteration policies are the same! ')
    else:
        print('Forest - Value and Policy Iteration policies are NOT the same.')

    if (vi.policy == ql.policy):
        print('Forest – QL and VI Policies are the same!')
    else:
        print('Forest – QL and VI Policies are NOT the same.')
    if (pi.policy == ql.policy):
        print('Forest – PI and QL Policies are the same!')
    else:
        print('Forest – PI and QL Policies are NOT the same.')

    # A Q-Learning Algorithm
    #
    # Source:
    #   https://www.oreilly.com/radar/introduction-to-reinforcement-learning-and-openai-gym/
    """
Example #21
    tune_ql = False
    if tune_ql:

        # max iter
        if False:
            iter_range = [
                10**4, 5 * (10**4), 10**5, 5 * (10**5), 10**6, 5 * (10**6)
            ]
            ql_time = []
            ql_max_v = []

            for iter in iter_range:
                ql = mdp.QLearning(transitions,
                                   rewards,
                                   gamma=0.99,
                                   epsilon=1.0,
                                   n_iter=iter)
                ql.run()
                ql_time.append(ql.time)
                ql_max_v.append(np.max(ql.V))

            plt.figure()
            plt.plot(iter_range, ql_time, label="QL")
            plt.xlabel('iterations')
            plt.ylabel('time')
            plt.title('iteration vs time')
            plt.legend()
            plt.savefig("charts/lake_ql_iter_time")

            plt.figure()
Example #22
def run_qlearn(envs, gamma=0.96, n_iters=10000, verbose=True):
    all_rewards = []
    all_mean_discrepancies_dfs = []
    all_error_dfs = []
    time_per_run = []

    num_episodes = len(envs)
    for env, episode in zip(envs, range(num_episodes)):
        P, R = env
        fm_qlearn = mdp.QLearning(
            transitions=P,
            reward=R,
            gamma=gamma,
            n_iter=n_iters,
        )
        # if verbose: fm_qlearn.setVerbose()
        t0 = time()
        fm_qlearn.run()
        time_elapsed = time() - t0
        time_per_run.append(time_elapsed)
        if verbose:
            print("Forest Management QLearning Episode", episode,
                  "runtime (s):", time_elapsed)

        # add mean discrepancies for each episode
        v_means = []
        for v_mean in fm_qlearn.v_mean:
            v_means.append(np.mean(v_mean))
        v_mean_df = pd.DataFrame(v_means, columns=['v_mean'])
        # v_mean_df.iloc[0: n_iters / 100, :] = v_means

        all_mean_discrepancies_dfs.append(v_mean_df)
        if verbose:
            print("Forest Management QLearning Episode", episode,
                  "mean discrepancy:", '\n', v_mean_df, '\n')

        error_over_iters = fm_qlearn.error_over_iters
        # print(error_over_iters)
        error_plot_df = pd.DataFrame(0,
                                     index=np.arange(1, n_iters + 1),
                                     columns=['error'])
        error_plot_df.iloc[0:len(error_over_iters), :] = error_over_iters
        all_error_dfs.append(error_plot_df)

        print_policy(fm_qlearn.policy)

        # rewards = calc_reward(fm_qlearn.policy, R)
        # total_reward = np.sum(rewards)
        # all_rewards.append(total_reward)
        # if verbose: print("Forest Management QLearning Episode", episode, "reward:", total_reward, '\n')

    # filename = "tmp/fm_qlearn_stats.csv"
    # rewards_df = pd.DataFrame(all_rewards)
    # rewards_df.to_csv(filename)

    combined_error_df = pd.concat(all_error_dfs, axis=1)
    mean_error_per_iter = combined_error_df.mean(axis=1)
    mean_error_per_iter.to_csv("tmp/fm_qlearn_error.csv")

    # plot the error over iterations
    title = "FM QL: error vs. iter (mean over " + str(
        num_episodes) + " episodes)"
    path = "graphs/fm_ql_error_iter.png"
    plotting.plot_error_over_iters(mean_error_per_iter, title, path)

    # show avg time per run
    avg_time_per_run = np.mean(np.array(time_per_run))
    print("FM QL - avg seconds per run:", avg_time_per_run, '\n')
Example #23
# avg V, n_iter, time
alpha_vals = [.1, .3, .5, .7, .9]
epsilon_vals = [.2, .4, .6, .8]

big_vs = []
big_n = []
big_t = []
for epsilon in epsilon_vals:
    avg_vs = []
    n_iters = []
    times = []
    for alpha in alpha_vals:
        q = mdp.QLearning(P_small,
                          R_small,
                          gamma=.9999,
                          alpha=alpha,
                          alpha_decay=1,
                          epsilon=epsilon,
                          epsilon_decay=.99)
        stats = q.run()

        avg_v = stats[-1]['Mean V']
        n_iter = len(stats)
        time = stats[-1]['Time']

        avg_vs.append(avg_v)
        n_iters.append(n_iter)
        times.append(time)

    big_vs.append(avg_vs)
    big_n.append(n_iters)
Example #24
def frozen_lake_all(P, R, gamma_range, mapping, shape):

    vi_iteration_list = np.zeros(gamma_range.shape)
    vi_time_list = np.zeros(gamma_range.shape)
    vi_reward_list = np.zeros(gamma_range.shape)
    vi_error_list = np.zeros(gamma_range.shape)

    pi_iteration_list = np.zeros(gamma_range.shape)
    pi_time_list = np.zeros(gamma_range.shape)
    pi_reward_list = np.zeros(gamma_range.shape)
    pi_error_list = np.zeros(gamma_range.shape)

    diff_list = np.zeros(gamma_range.shape)

    expected_policy = None

    for i, gamma in enumerate(gamma_range):
        print('Gamma %0.2f' % gamma)

        vi = mdp.ValueIteration(transitions=P,
                                reward=R,
                                gamma=gamma,
                                epsilon=0.0001,
                                max_iter=5000)
        # vi.setVerbose()
        vi.run()

        vi_iteration_list[i] = vi.run_stats[-1:][0]['Iteration']
        vi_time_list[i] = vi.run_stats[-1:][0]['Time']
        vi_reward_list[i] = vi.run_stats[-1:][0]['Reward']
        vi_error_list[i] = vi.run_stats[-1:][0]['Error']

        pi = mdp.PolicyIteration(transitions=P,
                                 reward=R,
                                 gamma=gamma,
                                 max_iter=5000,
                                 eval_type=1)
        # pi.setVerbose()
        pi.run()

        pi_iteration_list[i] = pi.run_stats[-1:][0]['Iteration']
        pi_time_list[i] = pi.run_stats[-1:][0]['Time']
        pi_reward_list[i] = pi.run_stats[-1:][0]['Reward']
        pi_error_list[i] = pi.run_stats[-1:][0]['Error']

        print('Value Iteration Policy Found: ' + str(vi.policy))
        print_policy(vi.policy, mapping, shape)
        print('Policy Iteration Policy Found: ' + str(pi.policy))
        print_policy(pi.policy, mapping, shape)

        difference1 = sum([abs(x - y) for x, y in zip(pi.policy, vi.policy)])
        diff_list[i] = difference1
        print('Discrepancy in Policy and Value Iteration: ', difference1)

        if difference1 == 0:
            expected_policy = vi.policy

        print()

        # Plotting
        # Error v Iteration
        plt.clf()
        plt.title('Value Iteration: Error v Iterations')
        plt.xlabel('Iterations')
        plt.ylabel('Error')
        plt.plot(list(vi_iteration_list), list(vi_error_list))
        plt.tight_layout()
        plt.savefig('plots/frozen_lakes/vi_error_v_iteration.png')

        # Reward v Gamma
        plt.clf()
        plt.title('Value Iteration: Reward v Gamma')
        plt.xlabel('Gamma')
        plt.ylabel('Reward')
        plt.plot(list(gamma_range), list(vi_reward_list))
        plt.tight_layout()
        plt.savefig('plots/frozen_lakes/vi_reward_v_gamma.png')

        # Gamma v Iterations
        plt.clf()
        plt.title('Value Iteration: Gamma v Iterations')
        plt.xlabel('Iterations')
        plt.ylabel('Gamma')
        plt.plot(list(vi_iteration_list), list(gamma_range))
        plt.tight_layout()
        plt.savefig('plots/frozen_lakes/vi_gamma_v_iterations.png')

        # Gamma v Time
        plt.clf()
        plt.title('Value Iteration: Gamma v Time')
        plt.xlabel('Time')
        plt.ylabel('Gamma')
        plt.plot(list(vi_time_list), list(gamma_range))
        plt.tight_layout()
        plt.savefig('plots/frozen_lakes/vi_gamma_v_time.png')

        # Reward vs Iterations
        plt.clf()
        plt.title('Value Iteration: Reward v Iterations')
        plt.xlabel('Iterations')
        plt.ylabel('Reward')
        plt.plot(list(vi_iteration_list), list(vi_reward_list))
        plt.tight_layout()
        plt.savefig('plots/frozen_lakes/vi_reward_v_iterations.png')

        # Policy
        # Error v Iteration
        plt.clf()
        plt.title('Policy Iteration: Error v Iterations')
        plt.xlabel('Iterations')
        plt.ylabel('Error')
        plt.scatter(list(pi_iteration_list), list(pi_error_list))
        plt.tight_layout()
        plt.savefig('plots/frozen_lakes/pi_error_v_iteration.png')

        # Gamma v Reward
        plt.clf()
        plt.title('Policy Iteration: Reward v Gamma')
        plt.xlabel('Gamma')
        plt.ylabel('Reward')
        plt.scatter(list(gamma_range), list(pi_reward_list))
        plt.tight_layout()
        plt.savefig('plots/frozen_lakes/pi_reward_v_gamma.png')

        # Gamma v Iterations
        plt.clf()
        plt.title('Policy Iteration: Gamma v Iterations')
        plt.xlabel('Iterations')
        plt.ylabel('Gamma')
        plt.scatter(list(pi_iteration_list), list(gamma_range))
        plt.tight_layout()
        plt.savefig('plots/frozen_lakes/pi_gamma_v_iterations.png')

        # Gamma v Time
        plt.clf()
        plt.title('Policy Iteration: Gamma v Time')
        plt.xlabel('Time')
        plt.ylabel('Gamma')
        plt.scatter(list(pi_time_list), list(gamma_range))
        plt.tight_layout()
        plt.savefig('plots/frozen_lakes/pi_gamma_v_time.png')

        # Reward vs Iterations
        plt.clf()
        plt.title('Policy Iteration: Reward v Iterations')
        plt.xlabel('Iterations')
        plt.ylabel('Reward')
        plt.scatter(list(pi_iteration_list), list(pi_reward_list))
        plt.tight_layout()
        plt.savefig('plots/frozen_lakes/pi_reward_v_iterations.png')

        # Gamma vs Policy Differences
        plt.clf()
        plt.title('Gamma v Policy Differences')
        plt.xlabel('Gamma')
        plt.ylabel('Policy Differences')
        plt.scatter(list(gamma_range), list(diff_list))
        plt.tight_layout()
        plt.savefig('plots/frozen_lakes/gamma_v_differences.png')

    # TODO
    gamma_range = np.array([0.8, 0.9, 0.99])
    alpha_range = np.array([0.1, 0.9, 0.99])
    epsilon_range = np.array([0.1, 0.5, 0.9, 0.999])
    e_decay_range = np.array([0.1, 0.5, 0.9, 0.999])

    # alpha_range = np.append(np.linspace(0.01, 0.1, 9), np.linspace(0.2, 0.99, 4))
    # epsilon_range = np.linspace(0.1, 1.0, 10)
    # e_decay_range = np.append(np.linspace(0.1, 0.9, 4), np.linspace(0.91, 0.99, 9))

    prev_Q = None
    thresh = 1e-4
    print('== Q Learning ==')
    for i, gamma in enumerate(gamma_range):
        for j, alpha in enumerate(alpha_range):
            for k, ep in enumerate(epsilon_range):
                for l, ed in enumerate(e_decay_range):
                    # print('ql: gamma - {}, alpha - {}, epsilon - {}, e_decay - {}'.format(gamma, alpha, ep, ed))
                    ql = mdp.QLearning(transitions=P,
                                       reward=R,
                                       gamma=gamma,
                                       alpha=alpha,
                                       alpha_decay=1.0,
                                       alpha_min=0.001,
                                       epsilon=ep,
                                       epsilon_min=0.1,
                                       epsilon_decay=ed,
                                       n_iter=10e4)
                    stats = ql.run()
                    plot_stats(stats,
                               ('ql_frozen_lake_%0.2f_%0.2f_%0.2f_%0.2f' %
                                (gamma, alpha, ep, ed)))

                    # print('Policy: ')
                    # print(ql.policy)
                    # print(ql.run_stats)
                    df = pd.DataFrame.from_records(ql.run_stats)
                    iteration_list = df['Iteration'][-100:]
                    windowed_reward = df['Reward'][-100:].mean()
                    error_list = df['Error'][-100:].mean()

                    if prev_Q is None:
                        prev_Q = ql.Q
                    else:
                        variation = np.absolute(
                            np.subtract(np.asarray(ql.Q),
                                        np.asarray(prev_Q))).max()
                        res = np.abs(
                            np.subtract(np.asarray(prev_Q), np.asarray(ql.Q)))
                        print('Result: ')
                        print(res)
                        print('Variation: ')
                        print(variation)
                        print('Mean Reward for Last 100 Iterations:')
                        print(windowed_reward)
                        if np.all(
                                res < thresh
                        ) or variation < thresh or windowed_reward > 45.0:
                            print('Breaking! Below Thresh')
                            print(
                                'Found at: gamma - {}, alpha - {}, epsilon - {}, e_decay - {}'
                                .format(gamma, alpha, ep, ed))
                            print('Optimal Policy: ')
                            print(ql.policy)
                            break