def test_backward_induction_sd(horizon, S, A):
    """
    Test backward induction in stage-dependent MDPs.
    """
    for sim in range(5):
        # generate random MDP
        Rstat, Pstat = get_random_mdp(S, A)
        R = np.zeros((horizon, S, A))
        P = np.zeros((horizon, S, A, S))
        for ii in range(horizon):
            R[ii, :, :] = Rstat
            P[ii, :, :, :] = Pstat

        # run backward induction in stationary MDP
        Qstat, Vstat = backward_induction(Rstat, Pstat, horizon)

        # run backward induction in stage-dependent MDP
        Q = np.zeros((horizon, S, A))
        V = np.zeros((horizon, S))
        backward_induction_sd(Q, V, R, P)

        # run backward induction with stage-dependent rewards
        Q2 = np.zeros((horizon, S, A))
        V2 = np.zeros((horizon, S))
        backward_induction_reward_sd(Q2, V2, R, Pstat)

        assert np.array_equal(Q, Qstat)
        assert np.array_equal(V, Vstat)
        assert np.array_equal(Q2, Qstat)
        assert np.array_equal(V2, Vstat)
def test_backward_induction(horizon, S, A):
    for sim in range(5):
        # generate random MDP
        R, P = get_random_mdp(S, A)

        # run backward induction
        Q, V = backward_induction(R, P, horizon)
        assert Q.max() <= horizon
        assert V.max() <= horizon

        # run backward induction with V clipped to 1.0
        Q, V = backward_induction(R, P, horizon, vmax=1.0)
        assert V.max() <= 1.0

        # run backward induction in place
        Q2 = np.zeros((horizon, S, A))
        V2 = np.zeros((horizon, S))
        backward_induction_in_place(Q2, V2, R, P, horizon, vmax=1.0)
        assert np.array_equal(Q, Q2)
        assert np.array_equal(V, V2)
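# A minimal sketch of how the two tests above might be driven with pytest;
# the parameter grid is an illustrative assumption, not the project's actual
# test configuration, and it relies on get_random_mdp / backward_induction
# already being imported in the test module.
import pytest


@pytest.mark.parametrize("horizon, S, A", [(5, 3, 2), (10, 4, 3)])
def test_backward_induction_shapes(horizon, S, A):
    # hypothetical extra check: Q and V have the stage-dependent shapes used above
    R, P = get_random_mdp(S, A)
    Q, V = backward_induction(R, P, horizon)
    assert Q.shape == (horizon, S, A)
    assert V.shape == (horizon, S)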
def fit(self, budget: int, **kwargs):
    """Run `budget` episodes and compute the Q function of the recommended
    policy."""
    del kwargs
    for _ in range(budget):
        self._run_episode()

    # compute Q function for the recommended policy
    self.Q_policy, _ = backward_induction(
        self.R_hat[:self.M, :],
        self.P_hat[:self.M, :, :self.M],
        self.horizon,
        self.gamma,
    )
def fit(self, budget=None, **kwargs):
    """Build empirical MDP and run value iteration."""
    del kwargs
    S = self.env.observation_space.n
    A = self.env.action_space.n

    self.N_sa = np.zeros((S, A))
    self.N_sas = np.zeros((S, A, S))
    self.S_sa = np.zeros((S, A))

    # collect data
    total_samples = S * A * self.n_samples
    count = 0
    logger.debug(
        f"[{self.name}] collecting {self.n_samples} samples per (s,a)"
        f", total = {total_samples} samples.")

    for ss in range(S):
        for aa in range(A):
            for _ in range(self.n_samples):
                next_state, reward, _, _ = self.env.sample(ss, aa)
                self._update(ss, aa, next_state, reward)

                count += 1
                if count % 10000 == 0:
                    completed = 100 * count / total_samples
                    logger.debug("[{}] ... {}/{} ({:0.0f}%)".format(
                        self.name, count, total_samples, completed))

    # build model and run VI
    logger.debug(
        f"{self.name} building model and running backward induction...")

    N_sa = np.maximum(self.N_sa, 1)
    self.R_hat = self.S_sa / N_sa
    self.P_hat = np.zeros((S, A, S))
    for ss in range(S):
        self.P_hat[:, :, ss] = self.N_sas[:, :, ss] / N_sa

    info = {}
    info["n_samples"] = self.n_samples
    info["total_samples"] = total_samples

    if self.horizon is None:
        assert self.gamma < 1.0, \
            "The discounted setting requires gamma < 1.0"
        self.Q, self.V, n_it = value_iteration(self.R_hat, self.P_hat,
                                               self.gamma, self.epsilon)
        info["n_iterations"] = n_it
        info["precision"] = self.epsilon
    else:
        self.Q, self.V = backward_induction(self.R_hat, self.P_hat,
                                            self.horizon, self.gamma)
        info["n_iterations"] = self.horizon
        info["precision"] = 0.0

    return info
def fit(self, **kwargs):
    """Run `self.n_episodes` episodes and compute the Q function of the
    recommended policy."""
    info = {}
    self._rewards = np.zeros(self.n_episodes)
    self._cumul_rewards = np.zeros(self.n_episodes)

    for _ in range(self.n_episodes):
        self._run_episode()

    # compute Q function for the recommended policy
    self.Q_policy, _ = backward_induction(self.R_hat[:self.M, :],
                                          self.P_hat[:self.M, :, :self.M],
                                          self.horizon, self.gamma)

    info["n_episodes"] = self.n_episodes
    info["episode_rewards"] = self._rewards
    return info
def fit(self, budget: int, **kwargs):
    """Run `budget` episodes and compute the Q function of the recommended
    policy."""
    del kwargs
    n_episodes_to_run = budget
    count = 0
    while count < n_episodes_to_run:
        self._run_episode()
        count += 1

    # compute Q function for the recommended policy
    self.Q_policy, _ = backward_induction(
        self.R_hat[:self.M, :],
        self.P_hat[:self.M, :, :self.M],
        self.horizon,
        self.gamma,
    )
def fit(self, **kwargs):
    """
    Run value iteration.
    """
    info = {}
    if self.horizon is None:
        assert self.gamma < 1.0, \
            "The discounted setting requires gamma < 1.0"
        self.Q, self.V, n_it = value_iteration(self.env.R, self.env.P,
                                               self.gamma, self.epsilon)
        info["n_iterations"] = n_it
        info["precision"] = self.epsilon
    else:
        self.Q, self.V = backward_induction(self.env.R, self.env.P,
                                            self.horizon, self.gamma)
        info["n_iterations"] = self.horizon
        info["precision"] = 0.0
    return info
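# A minimal usage sketch, assuming `agent` is an instance of a class defining
# the fit() above; `recommend_policy` is a hypothetical helper, not part of
# the original code. It assumes Q has shape (horizon, S, A) in the
# finite-horizon case and (S, A) in the discounted case, consistent with the
# tests earlier in this section.
def recommend_policy(agent):
    """Fit the agent and return the greedy policy w.r.t. its Q function."""
    agent.fit()
    # argmax over the action axis gives policy[h, s] (finite horizon)
    # or policy[s] (discounted)
    return agent.Q.argmax(axis=-1)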
def partial_fit(self, fraction, **kwargs):
    """Run a fraction of the total number of episodes and update the
    recommended policy."""
    assert 0.0 < fraction <= 1.0
    n_episodes_to_run = int(np.ceil(fraction * self.n_episodes))
    count = 0
    while count < n_episodes_to_run and self.episode < self.n_episodes:
        self._run_episode()
        count += 1

    # compute Q function for the recommended policy
    self.Q_policy, _ = backward_induction(self.R_hat[:self.M, :],
                                          self.P_hat[:self.M, :, :self.M],
                                          self.horizon, self.gamma)
    info = {
        "n_episodes": self.episode,
        "episode_rewards": self._rewards[:self.episode]
    }
    return info