Code Example #1
    def policy_walk(self):
        # Draw an initial reward sample and solve the corresponding MDP.
        random_rewards = self.sample_random_rewards(self.num_states, 1, 1)
        env = FrozenLakeEnv(is_slippery=True, rewards=random_rewards)
        env.num_actions = env.nA
        env.num_states = env.nS
        env.reset()
        dp = DP(env)
        dp.policy_iter()

        # Flatten the per-state Q-value dict into an array of shape
        # (num_states, num_actions), ordered by state index.
        dp.q_values = np.array([dp.q_values[s] for s in sorted(dp.q_values)])
        pi = dp.policy
        # PolicyWalk-style MCMC loop over candidate reward vectors.
        for _ in range(200):
            # Propose new rewards by perturbing the currently accepted ones.
            new_rewards = self.mcmc_reward_step(env.rewards,
                                                step_size=0.5,
                                                r_max=1)
            env_new = FrozenLakeEnv(is_slippery=True, rewards=new_rewards)
            env_new.num_actions = env_new.nA
            env_new.num_states = env_new.nS
            dp_new = DP(env_new)

            # Solve the proposed MDP to get Q-values under the new rewards.
            dp_new.policy_iter()
            dp_new_q_values = np.array(
                [dp_new.q_values[s] for s in sorted(dp_new.q_values)])

            # Pair the *current* policy pi with the Q-values computed under the
            # proposed rewards, so we can test whether pi is still optimal.
            dp_new = DP(env_new)
            dp_new.policy = pi
            dp_new.q_values = dp_new_q_values
            """
            if "dp_q_values < dp_new_q_values":
                    or
            if "dp_new_q_values(pi) < dp_new_q_values" (with this for now):
            
            """
            if self.optimal_q_check(dp_new.q_values, pi):
                # pi is no longer optimal under the proposed rewards: recompute
                # the optimal policy for them, then accept the new rewards and
                # the new policy with probability
                # min(1, posterior(dp_new) / posterior(dp)).
                dp_new.policy_iter()
                pi_new = dp_new.policy
                if np.random.random() < self.posteriors_ratio(dp, dp_new):
                    print("update env and pi")
                    env, pi = env_new, pi_new
            else:
                # pi is still optimal: accept only the proposed rewards, with
                # the same min(1, posterior ratio) rule, and keep the policy.
                if np.random.random() < self.posteriors_ratio(dp, dp_new):
                    print("update env")
                    env = env_new

        # The rewards of the last accepted environment are the recovered estimate.
        self.rewards_recovered = env.rewards
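
The method above leans on several helpers that are not shown here (sample_random_rewards, mcmc_reward_step, optimal_q_check, posteriors_ratio), plus module-level imports of numpy (as np), FrozenLakeEnv, and DP. As a rough illustration of what a PolicyWalk-style acceptance step needs from them, the following is a minimal sketch; every signature and modelling choice in it (a grid-valued reward proposal, one-hot policies of shape (num_states, num_actions), a Boltzmann likelihood with inverse temperature beta, and a uniform prior so the posterior ratio reduces to a likelihood ratio) is an assumption, not the original implementation.

import numpy as np
from scipy.special import logsumexp


def sample_random_rewards(num_states, step_size=1.0, r_max=1.0):
    # Hypothetical: start the chain on a random grid point in [-r_max, r_max] per state.
    grid = np.arange(-r_max, r_max + step_size, step_size)
    return np.random.choice(grid, size=num_states)


def mcmc_reward_step(rewards, step_size=0.5, r_max=1.0):
    # Hypothetical proposal: move one state's reward by +/- step_size,
    # clipped to [-r_max, r_max].
    new_rewards = np.array(rewards, dtype=float)
    idx = np.random.randint(len(new_rewards))
    new_rewards[idx] = np.clip(
        new_rewards[idx] + np.random.choice([-step_size, step_size]),
        -r_max, r_max)
    return new_rewards


def optimal_q_check(q_values, policy):
    # True if the current (one-hot) policy is no longer greedy
    # with respect to the given Q-values.
    greedy = q_values.argmax(axis=1)
    current = np.asarray(policy).reshape(q_values.shape).argmax(axis=1)
    return bool(np.any(greedy != current))


def posteriors_ratio(dp_old, dp_new, beta=1.0):
    # Ratio of unnormalised posteriors; with a uniform reward prior only the
    # Boltzmann likelihood terms P(a = pi(s) | s, R) survive.
    def log_likelihood(dp):
        q = np.asarray(dp.q_values, dtype=float)                   # (num_states, num_actions)
        pi = np.asarray(dp.policy, dtype=float).reshape(q.shape)   # one-hot policy
        chosen = (q * pi).sum(axis=1)                              # Q(s, pi(s))
        return np.sum(beta * chosen - logsumexp(beta * q, axis=1))

    return np.exp(log_likelihood(dp_new) - log_likelihood(dp_old))

With definitions along these lines, the loop amounts to a Metropolis-Hastings walk over reward vectors that re-optimises the policy only when a proposed reward makes the current one suboptimal.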