Code Example #1
File: experiments.py  Project: mattdeak/mdp-analysis
def run_frozenlake_solvers():
    # Build the small and large frozen lake environments
    small_lake = get_small_lake()
    large_lake = get_large_lake()

    small_lake_results = get_vipi_results(small_lake, max_iter=20000)
    large_lake_results = get_vipi_results(large_lake, max_iter=20000)

    for k, solver in small_lake_results.items():
        solver_type, discount, _ = k.split("_")

        with open(
                os.path.join(frozenlake_dir,
                             f"smalllake_{solver_type}_{discount}"),
                "wb") as f:
            pickle.dump(solver, f)

    for k, solver in large_lake_results.items():
        solver_type, discount, _ = k.split("_")

        with open(
                os.path.join(frozenlake_dir,
                             f"largelake_{solver_type}_{discount}"),
                "wb") as f:
            pickle.dump(solver, f)
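
A hedged companion sketch (not part of the repository): it shows how the pickled solvers written above could be loaded back, assuming the module-level frozenlake_dir variable and the "<lake>_<solver>_<discount>" filename pattern used in run_frozenlake_solvers.

import os
import pickle

def load_frozenlake_solvers(frozenlake_dir, lake="smalllake"):
    """Reload every pickled solver for one lake size, keyed by (solver_type, discount)."""
    solvers = {}
    for fname in os.listdir(frozenlake_dir):
        if not fname.startswith(lake):
            continue
        # Filenames look like "smalllake_vi_0.999" per run_frozenlake_solvers
        _, solver_type, discount = fname.split("_")
        with open(os.path.join(frozenlake_dir, fname), "rb") as f:
            solvers[(solver_type, float(discount))] = pickle.load(f)
    return solvers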
Code Example #2
File: experiments.py  Project: mattdeak/mdp-analysis
def run_convergence_experiment(env_name, N=20000, stopper=1):
    best_explorer_lookup_table = {
        "hunter": EqualExploration(),
        "smalllake": EpsilonGreedyConstant(0.1),
        "largelake": EpsilonGreedyConstant(0.1),
    }

    env_lookup = {
        "hunter": HuntingMDPWrapper(create_small_hunting_environment()),
        "smalllake": RewardShapedFrozenLake(get_small_lake()),
        "largelake": RewardShapedFrozenLake(get_large_lake()),
    }

    explorer = best_explorer_lookup_table[env_name]
    env = env_lookup[env_name]

    ql = QLearner(0.999, explorer, env)
    diffs = {}
    prev_Q = ql.q.copy()
    for i in tqdm(range(N // stopper)):
        ql.run_x_episodes(stopper)
        diff = abs(ql.q - prev_Q).max()
        prev_Q = ql.q.copy()
        diffs[i] = diff

    outfile = f'{env_name}_convergence.pkl'
    outpath = os.path.join('output', 'qlearning_convergence', outfile)
    with open(outpath, 'wb') as f:
        pickle.dump(diffs, f)

    return diffs
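
A hedged usage sketch (not from the repository): plot the per-checkpoint maximum Q-value change returned by run_convergence_experiment. The use of matplotlib and the stopper value of 100 are assumptions for illustration.

import matplotlib.pyplot as plt

stopper = 100
diffs = run_convergence_experiment("smalllake", N=20000, stopper=stopper)
checkpoints = sorted(diffs)
# Each checkpoint i corresponds to i * stopper completed episodes
plt.plot([i * stopper for i in checkpoints], [diffs[i] for i in checkpoints])
plt.xlabel("Episodes")
plt.ylabel("max |Q - Q_prev|")
plt.yscale("log")
plt.title("Q-learning convergence (smalllake)")
plt.show()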
Code Example #3
File: experiments.py  Project: mattdeak/mdp-analysis
def run_best_qlearner_vs_optimal_experiment(env):
    best_q_lookup = {
        'hunter': 'output/qlearning/hunter/equal',
        'smalllake': 'output/qlearning/shaped_smalllake/epsilonstatedecay',
        'largelake': 'output/qlearning/shaped_largelake/epsilonstatedecay'
    }
    env_lookup = {'smalllake': get_small_lake(), 'largelake': get_large_lake()}

    best_vi_lookup = {
        'hunter': 'output/hunterschoice/smallhunter_vi_0.999',
        'smalllake': 'output/frozenlake/smalllake_vi_0.999',
        'largelake': 'output/frozenlake/largelake_vi_0.999'
    }

    best_vi = best_vi_lookup[env]
    with open(best_vi, 'rb') as f:
        solver = pickle.load(f)

    indir = best_q_lookup[env]
    qlearners = get_sorted_qlearner_filepaths(indir)
    # Use last qlearner
    best = qlearners[-1]
    with open(os.path.join(indir, best), 'rb') as f:
        qt = pickle.load(f)

    def optimal_policy(state):
        return solver.policy[state]

    def learned_policy(state):
        return qt.argmax(axis=1)[state]

    if env == 'smalllake':
        optimal_results = simulate_lake_policy_from_func(optimal_policy,
                                                         size='small',
                                                         N=10000)
        learned_results = simulate_lake_policy_from_func(learned_policy,
                                                         size='small',
                                                         N=10000)
    elif env == 'largelake':
        optimal_results = simulate_lake_policy_from_func(optimal_policy,
                                                         size='large',
                                                         N=10000)
        learned_results = simulate_lake_policy_from_func(learned_policy,
                                                         size='large',
                                                         N=10000)
    elif env == 'hunter':
        optimal_results = simulate_hunterschoice_policy_from_func(
            optimal_policy, N=10000)
        learned_results = simulate_hunterschoice_policy_from_func(
            learned_policy, N=10000)

    outdir = os.path.join('output', 'final_evaluation')

    with open(os.path.join(outdir, f'{env}_optimal.pkl'), 'wb') as f:
        pickle.dump(optimal_results, f)

    with open(os.path.join(outdir, f'{env}_learned.pkl'), 'wb') as f:
        pickle.dump(learned_results, f)
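
A hedged follow-up sketch (not in the repository): reload the two pickles written by run_best_qlearner_vs_optimal_experiment and compare success rates. It assumes a lake environment, whose simulation helper returns a (success_rate, episode_lengths) tuple as in code example #5.

import os
import pickle

def summarize_final_evaluation(env, outdir=os.path.join("output", "final_evaluation")):
    """Print and return success rate and mean episode length for optimal vs learned policies."""
    summary = {}
    for kind in ("optimal", "learned"):
        with open(os.path.join(outdir, f"{env}_{kind}.pkl"), "rb") as f:
            success_rate, episode_lengths = pickle.load(f)
        mean_length = sum(episode_lengths) / len(episode_lengths)
        summary[kind] = (success_rate, mean_length)
        print(f"{env} {kind}: success={success_rate:.3f}, mean episode length={mean_length:.1f}")
    return summary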
Code Example #4
File: plotting.py  Project: mattdeak/mdp-analysis
def render_frozenlake_policies(title, discount_rate, size):
    _, small_policies = get_small_lake_stats_policies()
    _, large_policies = get_large_lake_stats_policies()
    small_lake = get_small_lake()
    large_lake = get_large_lake()
    if size == "small":
        vi_policy = [
            solver["policy"] for solver in small_policies["vi"]
            if float(solver["discount"]) == discount_rate
        ][0]
        vi_values = [
            solver["values"] for solver in small_policies["vi"]
            if float(solver["discount"]) == discount_rate
        ][0]
        pi_policy = [
            solver["policy"] for solver in small_policies["pi"]
            if float(solver["discount"]) == discount_rate
        ][0]

        render_frozenlake_policy(
            f"Value Iteration {title} (Small)",
            small_lake.m,
            vi_policy,
            values=vi_values,
        )
        render_frozenlake_policy(f"Policy Iteration {title} (Small)",
                                 small_lake.m, pi_policy)
    else:
        vi_policy = [
            solver["policy"] for solver in large_policies["vi"]
            if float(solver["discount"]) == discount_rate
        ][0]
        vi_values = [
            solver["values"] for solver in large_policies["vi"]
            if float(solver["discount"]) == discount_rate
        ][0]
        pi_policy = [
            solver["policy"] for solver in large_policies["pi"]
            if float(solver["discount"]) == discount_rate
        ][0]

        render_frozenlake_policy(
            f"Value Iteration {title} (Large)",
            large_lake.m,
            vi_policy,
            values=vi_values,
        )
        render_frozenlake_policy(f"Policy Iteration {title} (Large)",
                                 large_lake.m, pi_policy)
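
A minimal call sketch (not from the repository); the title string and the 0.999 discount are illustrative choices, with 0.999 picked only because it is the discount used elsewhere in the project.

render_frozenlake_policies("Gamma=0.999", discount_rate=0.999, size="small")
render_frozenlake_policies("Gamma=0.999", discount_rate=0.999, size="large")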
Code Example #5
File: experiments.py  Project: mattdeak/mdp-analysis
def simulate_lake_policy_from_func(policy, N=1000, size="small"):
    if size == "small":
        env = get_small_lake()
    else:
        env = get_large_lake()
    agent = Agent(env, policy)

    successes = 0
    episode_lengths = []
    for i in range(N):
        rewards = run_agent(agent)
        if rewards[-1] == 1:
            successes += 1
        episode_lengths.append(len(rewards))

    return successes / N, episode_lengths
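
A hedged usage sketch (not from the repository): any callable mapping a state index to an action can be passed as the policy. The uniform-random baseline below is hypothetical and assumes the standard four FrozenLake actions.

import random

def random_policy(state):
    # 0..3 = left, down, right, up in the standard FrozenLake action space
    return random.randint(0, 3)

success_rate, episode_lengths = simulate_lake_policy_from_func(
    random_policy, N=1000, size="small")
print(f"Random baseline: success={success_rate:.3f}, "
      f"mean length={sum(episode_lengths) / len(episode_lengths):.1f}")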
Code Example #6
File: experiments.py  Project: mattdeak/mdp-analysis
def simulate_frozenlake_policy(policy_location, N=1000):
    print(f"Simulating Policy {policy_location}")
    with open(policy_location, "rb") as f:
        solver = pickle.load(f)

    if "small" in policy_location:
        lake = get_small_lake()
    else:
        lake = get_large_lake()

    policy = create_frozenlake_policy(solver.policy)
    agent = Agent(lake, policy)

    successes = 0
    episode_lengths = []
    for i in range(N):
        rewards = run_agent(agent)
        if rewards[-1] == 1:
            successes += 1
        episode_lengths.append(len(rewards))

    return successes / N, episode_lengths
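
A hedged batch sketch (not in the repository): simulate every solver pickled by code example #1, assuming the files live under output/frozenlake as in the lookup paths of code example #3.

import glob
import os

for path in sorted(glob.glob(os.path.join("output", "frozenlake", "*lake_*"))):
    success_rate, episode_lengths = simulate_frozenlake_policy(path, N=1000)
    print(f"{path}: success={success_rate:.3f}")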
Code Example #7
File: experiments.py  Project: mattdeak/mdp-analysis
def run_all_qlearners(
    discount=0.999,
    save_episode_rate=500,
    max_episodes=20000,
    envs=[
        "hunter", "smalllake", "largelake", "shaped_smalllake",
        "shaped_largelake"
    ],
):

    hunter_raw = create_small_hunting_environment()
    hunter = HuntingMDPWrapper(hunter_raw)
    small_lake = get_small_lake()
    large_lake = get_large_lake()

    shaped_smalllake = RewardShapedFrozenLake(small_lake)
    shaped_largelake = RewardShapedFrozenLake(large_lake)

    randomexploration = RandomExploration()
    equalexploration = EqualExploration()
    greedyexploration = GreedyExploration()
    epsilongreedy = EpsilonGreedyConstant(0.1)
    epsilondecay = EpsilonGreedyDecay(decay_rate=0.00001)
    epsilonstatedecay = EpsilonGreedyStateBasedDecay()

    env_map = {
        "hunter": hunter,
        "smalllake": small_lake,
        "largeuake": shaped_largelake,
        "shaped_smalllake": shaped_smallake,
        "shaped_largelake": shaped_largelake,
    }

    for sub_dir in envs:
        env = env_map[sub_dir]
        root_dir = os.path.join(output_dir, "qlearning", sub_dir)
        for exploration_strategy, explore_dir in zip(
            [
                randomexploration,
                equalexploration,
                greedyexploration,
                epsilongreedy,
                epsilondecay,
                epsilonstatedecay,
            ],
            [
                "random",
                "equal",
                "greedy",
                "epsilongreedy",
                "epsilondecay",
                "epsilonstatedecay",
            ],
        ):
            print(f"Collecting Data For {sub_dir} with strategy {explore_dir}")

            save_dir = os.path.join(root_dir, explore_dir)
            collect_qlearner_data(
                env,
                exploration_strategy,
                save_dir,
                save_episode_rate=save_episode_rate,
                max_episodes=max_episodes,
                discount_rate=discount,
            )
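
A minimal invocation sketch (not in the repository): the envs argument can be restricted to a subset of the env_map keys, and the shorter max_episodes value here is only an illustrative choice for a quick run.

run_all_qlearners(
    discount=0.999,
    save_episode_rate=500,
    max_episodes=2000,
    envs=["smalllake", "shaped_smalllake"],
)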