def run_frozenlake_solvers():
    """Solve both frozen-lake sizes with VI/PI and pickle every solver.

    Each solver is written to ``frozenlake_dir`` under the file name
    ``{size}lake_{solver_type}_{discount}`` (no extension), derived from
    the ``"<type>_<discount>_<...>"`` keys of the VI/PI result dicts.
    """

    def _dump_solvers(results, prefix):
        # One pickle per (solver type, discount) pair; the third key
        # component is ignored, matching the original naming scheme.
        for key, solver in results.items():
            solver_type, discount, _ = key.split("_")
            path = os.path.join(frozenlake_dir,
                                f"{prefix}_{solver_type}_{discount}")
            with open(path, "wb") as f:
                pickle.dump(solver, f)

    # Get frozen lake statistics for both sizes.
    small_lake = get_small_lake()
    large_lake = get_large_lake()
    _dump_solvers(get_vipi_results(small_lake, max_iter=20000), "smalllake")
    _dump_solvers(get_vipi_results(large_lake, max_iter=20000), "largelake")
def run_convergence_experiment(env_name, N=20000, stopper=1):
    """Track Q-table convergence of a QLearner on the named environment.

    Runs ``N`` episodes in chunks of ``stopper`` episodes, recording after
    each chunk the max absolute change in the Q-table since the previous
    chunk. The per-chunk diffs are pickled to
    ``output/qlearning_convergence/{env_name}_convergence.pkl`` and returned.

    Args:
        env_name: one of 'hunter', 'smalllake', 'largelake'.
        N: total number of episodes to run.
        stopper: episodes per measurement chunk.

    Returns:
        dict mapping chunk index -> max |Q - Q_prev| for that chunk.
    """
    # Best-performing exploration strategy found for each environment.
    best_explorer_lookup_table = {
        "hunter": EqualExploration(),
        "smalllake": EpsilonGreedyConstant(0.1),
        "largelake": EpsilonGreedyConstant(0.1),
    }
    env_lookup = {
        "hunter": HuntingMDPWrapper(create_small_hunting_environment()),
        "smalllake": RewardShapedFrozenLake(get_small_lake()),
        "largelake": RewardShapedFrozenLake(get_large_lake()),
    }

    learner = QLearner(0.999, best_explorer_lookup_table[env_name],
                       env_lookup[env_name])

    convergence = {}
    snapshot = learner.q.copy()
    for chunk in tqdm(range(N // stopper)):
        learner.run_x_episodes(stopper)
        # Largest single-entry movement of the Q-table over this chunk.
        convergence[chunk] = abs(learner.q - snapshot).max()
        snapshot = learner.q.copy()

    outpath = os.path.join('output', 'qlearning_convergence',
                           f'{env_name}_convergence.pkl')
    with open(outpath, 'wb') as f:
        pickle.dump(convergence, f)
    return convergence
def run_best_qlearner_vs_optimal_experiment(env):
    """Compare the best learned Q-table against the optimal VI policy.

    Loads the pickled VI solver and the final (last-saved) Q-table for the
    environment, simulates both policies for 10000 episodes, and pickles
    the two result tuples to ``output/final_evaluation/{env}_optimal.pkl``
    and ``output/final_evaluation/{env}_learned.pkl``.

    Args:
        env: one of 'hunter', 'smalllake', 'largelake'.

    Raises:
        KeyError: if ``env`` is not one of the known environment names.
    """
    best_q_lookup = {
        'hunter': 'output/qlearning/hunter/equal',
        'smalllake': 'output/qlearning/shaped_smalllake/epsilonstatedecay',
        'largelake': 'output/qlearning/shaped_largelake/epsilonstatedecay'
    }
    best_vi_lookup = {
        'hunter': 'output/hunterschoice/smallhunter_vi_0.999',
        'smalllake': 'output/frozenlake/smalllake_vi_0.999',
        'largelake': 'output/frozenlake/largelake_vi_0.999'
    }
    # (Removed an unused `env_lookup` dict that eagerly built both lakes.)

    # NOTE: pickle.load is safe here only because these are files this
    # project wrote itself; never unpickle untrusted data.
    best_vi = best_vi_lookup[env]
    with open(best_vi, 'rb') as f:
        solver = pickle.load(f)

    indir = best_q_lookup[env]
    qlearners = get_sorted_qlearner_filepaths(indir)
    # Use last qlearner (the most-trained snapshot).
    best = qlearners[-1]
    with open(os.path.join(indir, best), 'rb') as f:
        qt = pickle.load(f)

    def optimal_policy(state):
        return solver.policy[state]

    def learned_policy(state):
        # Greedy action with respect to the learned Q-table.
        return qt.argmax(axis=1)[state]

    if env == 'smalllake':
        optimal_results = simulate_lake_policy_from_func(optimal_policy,
                                                         size='small',
                                                         N=10000)
        learned_results = simulate_lake_policy_from_func(learned_policy,
                                                         size='small',
                                                         N=10000)
    elif env == 'largelake':
        optimal_results = simulate_lake_policy_from_func(optimal_policy,
                                                         size='large',
                                                         N=10000)
        learned_results = simulate_lake_policy_from_func(learned_policy,
                                                         size='large',
                                                         N=10000)
    elif env == 'hunter':
        optimal_results = simulate_hunterschoice_policy_from_func(
            optimal_policy, N=10000)
        learned_results = simulate_hunterschoice_policy_from_func(
            learned_policy, N=10000)

    outdir = os.path.join('output', 'final_evaluation')
    with open(os.path.join(outdir, f'{env}_optimal.pkl'), 'wb') as f:
        pickle.dump(optimal_results, f)
    with open(os.path.join(outdir, f'{env}_learned.pkl'), 'wb') as f:
        pickle.dump(learned_results, f)
def render_frozenlake_policies(title, discount_rate, size):
    """Render the VI and PI policies for one lake size at one discount.

    Looks up the recorded VI/PI solver entries whose discount equals
    ``discount_rate`` and renders the VI policy (with state values) and
    the PI policy over the lake map.

    Args:
        title: label inserted into the rendered plot titles.
        discount_rate: discount rate to select among the stored solvers.
        size: "small" for the small lake; anything else selects large.

    Raises:
        IndexError: if no stored solver matches ``discount_rate``.
    """

    def _first_match(solvers, field):
        # First recorded entry for exactly this discount rate.
        return [s[field] for s in solvers
                if float(s["discount"]) == discount_rate][0]

    _, small_policies = get_small_lake_stats_policies()
    _, large_policies = get_large_lake_stats_policies()
    small_lake = get_small_lake()
    large_lake = get_large_lake()

    # Select per-size data once; everything below is size-independent.
    if size == "small":
        policies, lake, label = small_policies, small_lake, "Small"
    else:
        policies, lake, label = large_policies, large_lake, "Large"

    vi_policy = _first_match(policies["vi"], "policy")
    vi_values = _first_match(policies["vi"], "values")
    pi_policy = _first_match(policies["pi"], "policy")

    render_frozenlake_policy(
        f"Value Iteration {title} ({label})",
        lake.m,
        vi_policy,
        values=vi_values,
    )
    render_frozenlake_policy(f"Policy Iteration {title} ({label})", lake.m,
                             pi_policy)
def simulate_lake_policy_from_func(policy, N=1000, size="small"):
    """Roll out a policy function on a frozen lake and report statistics.

    Args:
        policy: callable mapping a state to an action.
        N: number of episodes to simulate.
        size: "small" for the small lake; anything else selects large.

    Returns:
        (success_rate, episode_lengths) where an episode counts as a
        success when its final reward equals 1.
    """
    env = get_small_lake() if size == "small" else get_large_lake()
    agent = Agent(env, policy)

    wins = 0
    episode_lengths = []
    for _ in range(N):
        rewards = run_agent(agent)
        # Reaching the goal yields a terminal reward of 1.
        if rewards[-1] == 1:
            wins += 1
        episode_lengths.append(len(rewards))
    return wins / N, episode_lengths
def simulate_frozenlake_policy(policy_location, N=1000):
    """Load a pickled solver and simulate its policy on the matching lake.

    The lake size is inferred from the pickle's path: a path containing
    "small" selects the small lake, otherwise the large one.

    Args:
        policy_location: path to a pickled solver exposing ``.policy``.
        N: number of episodes to simulate.

    Returns:
        (success_rate, episode_lengths), as produced by
        simulate_lake_policy_from_func.
    """
    print(f"Simulating Policy {policy_location}")
    # NOTE: pickle.load is safe here only because these are files this
    # project wrote itself; never unpickle untrusted data.
    with open(policy_location, "rb") as f:
        solver = pickle.load(f)

    policy = create_frozenlake_policy(solver.policy)
    size = "small" if "small" in policy_location else "large"
    # Delegate the rollout loop so the success-rate logic lives in one place.
    return simulate_lake_policy_from_func(policy, N=N, size=size)
def run_all_qlearners(
    discount=0.999,
    save_episode_rate=500,
    max_episodes=20000,
    envs=(
        "hunter",
        "smalllake",
        "largelake",
        "shaped_smalllake",
        "shaped_largelake",
    ),
):
    """Collect Q-learning data for every (environment, strategy) pair.

    For each environment named in ``envs`` and each of the six exploration
    strategies, runs collect_qlearner_data and saves results under
    ``{output_dir}/qlearning/{env}/{strategy}``.

    Args:
        discount: discount rate passed to each learner.
        save_episode_rate: how often (in episodes) snapshots are saved.
        max_episodes: episode budget per learner.
        envs: iterable of environment names to run (default: all five).
            A tuple default replaces the original mutable list default.
    """
    hunter = HuntingMDPWrapper(create_small_hunting_environment())
    small_lake = get_small_lake()
    large_lake = get_large_lake()

    env_map = {
        "hunter": hunter,
        "smalllake": small_lake,
        # BUG FIX: this key was misspelled "largeuake" and pointed at the
        # shaped lake, so the default "largelake" entry raised KeyError.
        "largelake": large_lake,
        "shaped_smalllake": RewardShapedFrozenLake(small_lake),
        "shaped_largelake": RewardShapedFrozenLake(large_lake),
    }

    # Strategy instances are created once and shared across environments,
    # matching the original behavior.
    strategies = [
        (RandomExploration(), "random"),
        (EqualExploration(), "equal"),
        (GreedyExploration(), "greedy"),
        (EpsilonGreedyConstant(0.1), "epsilongreedy"),
        (EpsilonGreedyDecay(decay_rate=0.00001), "epsilondecay"),
        (EpsilonGreedyStateBasedDecay(), "epsilonstatedecay"),
    ]

    for sub_dir in envs:
        env = env_map[sub_dir]
        root_dir = os.path.join(output_dir, "qlearning", sub_dir)
        for exploration_strategy, explore_dir in strategies:
            print(f"Collecting Data For {sub_dir} with strategy {explore_dir}")
            collect_qlearner_data(
                env,
                exploration_strategy,
                os.path.join(root_dir, explore_dir),
                save_episode_rate=save_episode_rate,
                max_episodes=max_episodes,
                discount_rate=discount,
            )