def __init__(self, exp, idx):
    super().__init__(exp, idx)
    self.exp = exp
    self.idx = idx

    self.states = self.params['states']

    # behavior and target are two-action Bernoulli policies, each parameterized
    # by the probability of the first action
    mu_pl = self.params['behavior']
    self.behavior = Policy(lambda s: [mu_pl, 1 - mu_pl])

    pi_pl = self.params['target']
    self.target = Policy(lambda s: [pi_pl, 1 - pi_pl])

    self.env = RWEnv(self.states)

    # build representation
    representation = self.params['representation']
    self.rep = self._buildRepresentation(representation)

    # build agent
    self.agent = self.Agent(self.rep.features(), 2, self.params)
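# NOTE: the Policy class is defined elsewhere in the repository. A minimal sketch of
# the interface assumed by these constructors (probs() is the only method relied on
# below; selectAction() is a hypothetical convenience, not taken from this file):
#
#   import numpy as np
#
#   class Policy:
#       def __init__(self, probs_fn):
#           # probs_fn maps a state to a list of action probabilities
#           self.probs_fn = probs_fn
#
#       def probs(self, s):
#           return self.probs_fn(s)
#
#       def selectAction(self, s, rng=np.random):
#           p = self.probs(s)
#           return rng.choice(len(p), p=p)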
def __init__(self, exp, idx):
    super().__init__(exp, idx)
    self.exp = exp
    self.idx = idx

    N = self._getSize()
    self.reward_scale = self.metaParameters.get('reward_scale', 1)
    self.env = Chain(N, self.reward_scale)

    # build target policy; behavior takes each action with equal probability
    self.target = self._getTarget()
    self.behavior = Policy(lambda s: [0.5, 0.5])

    # true value function under the target policy
    self.v_star = self.compute_v(N, self.target, self.reward_scale)

    # build representation
    self.rep = self._getRepresentation(N)

    # build agent
    self.agent = self.Agent(self.rep.features(), self.metaParameters)

    # compute the observable (feature) vector for each state once
    self.X = np.array([self.rep.encode(i) for i in range(N + 1)])

    # (1 / (n + 1)) * sum_{k=0}^{n} P^k converges to a matrix with d_b in every row,
    # where P is the Markov chain induced by the behavior policy and d_b is its
    # stationary state distribution
    self.db = self._getdb()

    # build transition probability matrix under the target policy; state N is the
    # absorbing terminal state, reached from either end of the chain
    self.P = np.zeros((N + 1, N + 1))
    pl, pr = self.target.probs(0)
    self.P[0, 1] = pr
    self.P[0, N] = pl
    self.P[N - 1, N - 2] = pl
    self.P[N - 1, N] = pr
    for i in range(1, N - 1):
        self.P[i, i - 1] = pl
        self.P[i, i + 1] = pr

    # expected one-step reward for each state: -reward_scale on the left exit,
    # +reward_scale on the right exit, zero elsewhere
    self.R = np.zeros(N + 1)
    self.R[0] = pl * -self.reward_scale
    self.R[N - 1] = pr * self.reward_scale

    self.setupIdealH()
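# For reference: given the P and R constructed above, the target policy's value
# function can be obtained by solving the Bellman equation directly. This is only
# an illustrative sketch; gamma = 1 (episodic chain) and the exact return shape of
# compute_v are assumptions, not taken from this file:
#
#   A = np.eye(N + 1) - self.P          # terminal row of P is zero, so A is invertible
#   v = np.linalg.solve(A, self.R)      # solves v = R + P v, with v[N] = 0 at the terminal
#   # v[:N] would be expected to match the non-terminal entries of self.v_star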
def _getTarget(self):
    return Policy(lambda s: [0.1, 0.9])
def _getTarget(self):
    return Policy(lambda s: [0.25, 0.75])
def _getTarget(self):
    return Policy(lambda s: [0.4, 0.6])