def policy_walk(self):
    # Sample an initial reward vector R and solve the induced MDP for an
    # initial policy pi and its Q-values.
    random_rewards = self.sample_random_rewards(self.num_states, 1, 1)
    env = FrozenLakeEnv(is_slippery=True, rewards=random_rewards)
    env.num_actions = env.nA
    env.num_states = env.nS
    env.reset()
    dp = DP(env)
    dp.policy_iter()
    dp.q_values = np.array([dp.q_values[s] for s in dp.q_values])
    pi = dp.policy

    for _ in range(200):
        # Propose R_new by a random-walk step from the current rewards.
        new_rewards = self.mcmc_reward_step(env.rewards, step_size=0.5, r_max=1)
        env_new = FrozenLakeEnv(is_slippery=True, rewards=new_rewards)
        env_new.num_actions = env_new.nA
        env_new.num_states = env_new.nS

        # Solve the proposed MDP for its Q-values, then attach the current
        # policy pi to a fresh DP so the acceptance test compares pi
        # against those Q-values.
        dp_new = DP(env_new)
        dp_new.policy_iter()
        dp_new_q_values = np.array(
            [dp_new.q_values[s] for s in dp_new.q_values])
        dp_new = DP(env_new)
        dp_new.policy = pi
        dp_new.q_values = dp_new_q_values

        # Is pi still optimal under R_new? That is, does
        # Q_new(s, pi(s)) < max_a Q_new(s, a) hold for some state s?
        if self.optimal_q_check(dp_new.q_values, pi):
            # pi is suboptimal under R_new: recompute the optimal policy
            # pi_new and accept (R_new, pi_new) with probability
            # min(1, ratio of posteriors of dp_new and dp).
            dp_new.policy_iter()
            pi_new = dp_new.policy
            if np.random.random() < self.posteriors_ratio(dp, dp_new):
                print("update env and pi")
                env, pi = env_new, pi_new
        else:
            # pi remains optimal under R_new: accept R_new alone with
            # probability min(1, posterior ratio), keeping pi unchanged.
            if np.random.random() < self.posteriors_ratio(dp, dp_new):
                print("update env")
                env = env_new

    self.rewards_recovered = env.rewards
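
# ---------------------------------------------------------------------------
# The helpers called in policy_walk() -- sample_random_rewards,
# mcmc_reward_step, optimal_q_check, posteriors_ratio -- are defined
# elsewhere in this class.  A minimal sketch of plausible implementations
# follows, in the spirit of PolicyWalk (Ramachandran & Amir, 2007).  The
# bodies below are illustrative assumptions, not the originals: they assume
# q_values shaped (nS, nA) and a policy stored as per-state action weights,
# matching the (16, 4) reshapes used elsewhere in this file.

def sample_random_rewards(self, n_states, step_size, r_max):
    """Draw an initial reward vector uniformly from the grid
    {-r_max, -r_max + step_size, ..., r_max}^n_states."""
    grid = np.arange(-r_max, r_max + step_size, step_size)
    return np.random.choice(grid, size=n_states)

def mcmc_reward_step(self, rewards, step_size, r_max):
    """One random-walk proposal: nudge a single state's reward by
    +/- step_size, clipped to [-r_max, r_max]."""
    new_rewards = np.array(rewards, dtype=float)
    idx = np.random.randint(len(new_rewards))
    new_rewards[idx] += step_size * np.random.choice([-1.0, 1.0])
    return np.clip(new_rewards, -r_max, r_max)

def optimal_q_check(self, q_values, pi):
    """True if pi is suboptimal somewhere under q_values, i.e.
    Q(s, pi(s)) < max_a Q(s, a) for some state s."""
    q = np.asarray(q_values)
    greedy = np.asarray(pi).reshape(q.shape).argmax(axis=1)
    q_pi = q[np.arange(q.shape[0]), greedy]
    return bool((q_pi + 1e-8 < q.max(axis=1)).any())

def posteriors_ratio(self, dp, dp_new, alpha=1.0):
    """Ratio of unnormalised posteriors P(R_new | pi_new) / P(R | pi),
    with a Boltzmann likelihood prod_s exp(alpha * Q(s, pi(s))) and a
    flat prior over rewards; the caller applies min(1, ratio) implicitly
    by comparing the ratio against a uniform draw."""
    def log_posterior(d):
        q = np.asarray(d.q_values)
        a = np.asarray(d.policy).reshape(q.shape).argmax(axis=1)
        return alpha * q[np.arange(q.shape[0]), a].sum()
    return np.exp(log_posterior(dp_new) - log_posterior(dp))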
    sleep(1)
    env.close()
    return sim_store


if __name__ == "__main__":
    env = FrozenLakeEnv(is_slippery=True)
    env.num_actions = env.nA
    env.num_states = env.nS
    env.reset()

    # Solve the true MDP with policy iteration to obtain the expert's
    # policy and Q-values.
    dp = DP(env)
    for _ in range(100):
        dp.policy_eval()
        dp.policy_imp()
    dp.q_values = np.array([dp.q_values[s] for s in dp.q_values])

    # Bayesian IRL: simulate expert demonstrations, then run PolicyWalk
    # to recover the rewards.
    birl = Birl(env.num_states)
    print("Running Sim")
    birl.sim_store = birl.sim(dp)
    print("Running Sim Done")
    birl.policy_walk()
    rewards_implicit = np.array([