Example 1
def test_backward_induction_sd(horizon, S, A):
    """
    Test stage-dependent MDPs
    """
    for sim in range(5):
        # generate random MDP
        Rstat, Pstat = get_random_mdp(S, A)
        R = np.zeros((horizon, S, A))
        P = np.zeros((horizon, S, A, S))
        for ii in range(horizon):
            R[ii, :, :] = Rstat
            P[ii, :, :, :] = Pstat

        # run backward induction in stationary MDP
        Qstat, Vstat = backward_induction(Rstat, Pstat, horizon)

        # run backward induction in stage-dependent MDP
        Q = np.zeros((horizon, S, A))
        V = np.zeros((horizon, S))
        backward_induction_sd(Q, V, R, P)

        # run backward induction with stage-dependent rewards
        Q2 = np.zeros((horizon, S, A))
        V2 = np.zeros((horizon, S))
        backward_induction_reward_sd(Q2, V2, R, Pstat)

        assert np.array_equal(Q, Qstat)
        assert np.array_equal(V, Vstat)
        assert np.array_equal(Q2, Qstat)
        assert np.array_equal(V2, Vstat)
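
For reference, the recursion exercised by this test is the standard finite-horizon Bellman backup, written here against the in-place, stage-dependent signature used above. This is only a minimal sketch: it assumes the last stage collects just its immediate reward, and the `gamma`/`vmax` defaults stand in for whatever handling the real implementation uses.

import numpy as np

def backward_induction_sd_sketch(Q, V, R, P, gamma=1.0, vmax=np.inf):
    """Fill Q (H, S, A) and V (H, S) in place for a stage-dependent MDP.

    R has shape (H, S, A) and P has shape (H, S, A, S); the last stage
    earns only its immediate reward.
    """
    H, S, A = R.shape
    for hh in range(H - 1, -1, -1):
        if hh == H - 1:
            Q[hh] = R[hh]
        else:
            # (S, A, S) @ (S,) -> (S, A): expected optimal value of the next stage
            Q[hh] = R[hh] + gamma * P[hh] @ V[hh + 1]
        V[hh] = np.minimum(Q[hh].max(axis=1), vmax)
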
Example 2
def test_backward_induction(horizon, S, A):
    for sim in range(5):
        # generate random MDP
        R, P = get_random_mdp(S, A)

        # run backward induction
        Q, V = backward_induction(R, P, horizon)

        assert Q.max() <= horizon
        assert V.max() <= horizon

        # run backward induction, clipping V to 1.0
        Q, V = backward_induction(R, P, horizon, vmax=1.0)
        assert V.max() <= 1.0

        # run backward induction in place
        Q2 = np.zeros((horizon, S, A))
        V2 = np.zeros((horizon, S))
        backward_induction_in_place(Q2, V2, R, P, horizon, vmax=1.0)
        assert np.array_equal(Q, Q2)
        assert np.array_equal(V, V2)
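
The assertions Q.max() <= horizon and V.max() <= horizon only hold if get_random_mdp draws rewards in [0, 1]: the return accumulated over at most `horizon` steps then cannot exceed `horizon`. A plausible generator under that assumption (the actual helper may differ in distributional details):

import numpy as np

def get_random_mdp_sketch(S, A, rng=None):
    """Random MDP with rewards in [0, 1] and row-stochastic transitions."""
    rng = np.random.default_rng(rng)
    R = rng.uniform(0.0, 1.0, size=(S, A))
    # each P[s, a, :] is drawn from a Dirichlet, so it sums to 1
    P = rng.dirichlet(np.ones(S), size=(S, A))
    return R, P
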
Example 3
    def fit(self, budget: int, **kwargs):
        del kwargs
        for _ in range(budget):
            self._run_episode()

        # compute Q function for the recommended policy
        self.Q_policy, _ = backward_induction(
            self.R_hat[:self.M, :],
            self.P_hat[:self.M, :, :self.M],
            self.horizon,
            self.gamma,
        )
Example 4
    def fit(self, budget=None, **kwargs):
        """Build empirical MDP and run value iteration."""
        del kwargs
        S = self.env.observation_space.n
        A = self.env.action_space.n
        self.N_sa = np.zeros((S, A))
        self.N_sas = np.zeros((S, A, S))
        self.S_sa = np.zeros((S, A))

        # collect data
        total_samples = S * A * self.n_samples
        count = 0
        logger.debug(
            f"[{self.name}] collecting {self.n_samples} samples per (s,a)"
            f", total = {total_samples} samples.")
        for ss in range(S):
            for aa in range(A):
                for _ in range(self.n_samples):
                    next_state, reward, _, _ = self.env.sample(ss, aa)
                    self._update(ss, aa, next_state, reward)

                    count += 1
                    if count % 10000 == 0:
                        completed = 100 * count / total_samples
                        logger.debug("[{}] ... {}/{} ({:0.0f}%)".format(
                            self.name, count, total_samples, completed))

        # build model and run VI
        logger.debug(
            f"{self.name} building model and running backward induction...")

        N_sa = np.maximum(self.N_sa, 1)
        self.R_hat = self.S_sa / N_sa
        self.P_hat = np.zeros((S, A, S))
        for ss in range(S):
            self.P_hat[:, :, ss] = self.N_sas[:, :, ss] / N_sa

        info = {}
        info["n_samples"] = self.n_samples
        info["total_samples"] = total_samples
        if self.horizon is None:
            assert self.gamma < 1.0, "The discounted setting requires gamma < 1.0"
            self.Q, self.V, n_it = value_iteration(self.R_hat, self.P_hat,
                                                   self.gamma, self.epsilon)
            info["n_iterations"] = n_it
            info["precision"] = self.epsilon
        else:
            self.Q, self.V = backward_induction(self.R_hat, self.P_hat,
                                                self.horizon, self.gamma)
            info["n_iterations"] = self.horizon
            info["precision"] = 0.0
        return info
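
The counters N_sa, N_sas and S_sa filled through self._update are visit counts and cumulative rewards per (state, action) pair, given how the estimates R_hat = S_sa / N_sa and P_hat = N_sas / N_sa are formed above. A minimal sketch of such an update, inferred from the call site rather than taken from the library:

    def _update(self, state, action, next_state, reward):
        """Accumulate the counts used to build the empirical MDP."""
        self.N_sa[state, action] += 1
        self.N_sas[state, action, next_state] += 1
        self.S_sa[state, action] += reward
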
Example 5
    def fit(self, **kwargs):
        info = {}
        self._rewards = np.zeros(self.n_episodes)
        self._cumul_rewards = np.zeros(self.n_episodes)
        for _ in range(self.n_episodes):
            self._run_episode()

        # compute Q function for the recommended policy
        self.Q_policy, _ = backward_induction(self.R_hat[:self.M, :],
                                              self.P_hat[:self.M, :, :self.M],
                                              self.horizon, self.gamma)

        info["n_episodes"] = self.n_episodes
        info["episode_rewards"] = self._rewards
        return info
Example 6
    def fit(self, budget: int, **kwargs):
        del kwargs
        n_episodes_to_run = budget
        count = 0
        while count < n_episodes_to_run:
            self._run_episode()
            count += 1

        # compute Q function for the recommended policy
        self.Q_policy, _ = backward_induction(
            self.R_hat[:self.M, :],
            self.P_hat[:self.M, :, :self.M],
            self.horizon,
            self.gamma,
        )
Example 7
    def fit(self, **kwargs):
        """
        Run value iteration.
        """
        info = {}
        if self.horizon is None:
            assert self.gamma < 1.0, \
                "The discounted setting requires gamma < 1.0"
            self.Q, self.V, n_it = value_iteration(self.env.R, self.env.P,
                                                   self.gamma, self.epsilon)
            info["n_iterations"] = n_it
            info["precision"] = self.epsilon
        else:
            self.Q, self.V = backward_induction(self.env.R, self.env.P,
                                                self.horizon, self.gamma)
            info["n_iterations"] = self.horizon
            info["precision"] = 0.0
        return info
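
In the discounted branch, value_iteration iterates a Bellman optimality backup until successive value functions are close to within epsilon. A minimal NumPy sketch of that fixed-point loop, assuming a sup-norm stopping rule (the library's implementation and exact stopping criterion may differ):

import numpy as np

def value_iteration_sketch(R, P, gamma, epsilon, max_iter=10_000):
    """Discounted value iteration run until the sup-norm change is below epsilon."""
    S, A = R.shape
    V = np.zeros(S)
    for n_it in range(1, max_iter + 1):
        Q = R + gamma * P @ V            # (S, A, S) @ (S,) -> (S, A)
        V_new = Q.max(axis=1)
        if np.abs(V_new - V).max() < epsilon:
            V = V_new
            break
        V = V_new
    return Q, V, n_it
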
Example 8
    def partial_fit(self, fraction, **kwargs):
        assert 0.0 < fraction <= 1.0
        n_episodes_to_run = int(np.ceil(fraction * self.n_episodes))
        count = 0
        while count < n_episodes_to_run and self.episode < self.n_episodes:
            self._run_episode()
            count += 1

        # compute Q function for the recommended policy
        self.Q_policy, _ = backward_induction(self.R_hat[:self.M, :],
                                              self.P_hat[:self.M, :, :self.M],
                                              self.horizon, self.gamma)

        info = {
            "n_episodes": self.episode,
            "episode_rewards": self._rewards[:self.episode]
        }
        return info
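
A hypothetical call pattern, only to illustrate how partial_fit splits the episode budget; the agent class name and constructor arguments below are placeholders, not names from the library:

# All names here are placeholders for illustration only.
agent = SomeFiniteMDPAgent(env, n_episodes=200, horizon=20, gamma=1.0)
for _ in range(4):
    info = agent.partial_fit(0.25)   # run roughly a quarter of the episode budget
print(info["n_episodes"], len(info["episode_rewards"]))
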