"""
Feature representations used in TODO: cite <Sutton et al. 2009>
"""

def invertedFeatures(n: int):
    # additive inverse of tabular (hence the name)
    m = 1 - tabularFeatures(n)
    return _normRows(m)

@try2jit
def dependentFeatures(n: int):
    nfeats = int(np.floor(n / 2) + 1)
    m = np.zeros((n, nfeats))

    idx = 0
    for i in range(nfeats):
        m[idx, 0: i + 1] = 1
        idx += 1

    for i in range(nfeats - 1, 0, -1):
        m[idx, -i:] = 1
        idx += 1

    return _normRows(m)

def tabularFeatures(n: int):
    return np.eye(n)

addToCategory('ope', RandomWalk)
addToCategory('random-walk', RandomWalk)
addToCategory('finite-dynamics', RandomWalk)
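A minimal sketch of what `dependentFeatures(5)` produces, assuming `_normRows` rescales each row to unit L2 norm (its definition is not shown in this excerpt); the rows sweep a growing then shrinking block of ones across the states, so neighboring states share features:

import numpy as np

def _norm_rows(m: np.ndarray) -> np.ndarray:
    # assumed behavior of _normRows: rescale each row to unit L2 norm
    return m / np.linalg.norm(m, axis=1, keepdims=True)

n = 5
nfeats = n // 2 + 1  # 3 features for a 5-state walk
m = np.zeros((n, nfeats))

idx = 0
for i in range(nfeats):             # rows [1,0,0], [1,1,0], [1,1,1]
    m[idx, 0: i + 1] = 1
    idx += 1
for i in range(nfeats - 1, 0, -1):  # rows [0,1,1], [0,0,1]
    m[idx, -i:] = 1
    idx += 1

print(_norm_rows(m))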
class BECounterexample(FiniteDynamics):
    num_states = 3
    num_actions = 2

    K = immutable(_buildTransitionKernel())
    Rs = immutable(_buildRewardKernel())
    T = immutable(np.zeros((3, 2, 3)))
    d0 = immutable(np.array([1., 0, 0]))

# some utility functions to encode other important parts of the problem spec
# not necessarily environment specific, but this is as good a place as any to store them
def behaviorPolicy(s: int):
    return np.array([0.5, 0.5])

def representationMatrix():
    return np.array([
        [1., 0],
        [0, 1],
        [0, 1],
    ])

addToCategory('ope', BECounterexample)
addToCategory('ope-counterexample', BECounterexample)
addToCategory('finite-dynamics', BECounterexample)
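Note that the second and third states share the feature vector [0, 1], so any linear value function must assign them the same value; that aliasing is the crux of the counterexample. A sketch of one way to collapse the kernels under the behavior policy, assuming `K[s, a, sp]` holds transition probabilities (`_buildTransitionKernel` is defined above this excerpt):

import numpy as np

def policyTransitionMatrix(K: np.ndarray, pi: np.ndarray) -> np.ndarray:
    # collapse a state-action kernel K[s, a, sp] under a fixed
    # policy pi[s, a] into a state-to-state matrix P[s, sp]
    return np.einsum('sa,sap->sp', pi, K)

pi = np.stack([behaviorPolicy(s) for s in range(BECounterexample.num_states)])
P = policyTransitionMatrix(BECounterexample.K, pi)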
    def start(self):
        # perturb the start state multiplicatively by a normal random
        # amount for each component of the system
        eps = 1
        if self.random_start > 0:
            eps = self.start_rng.normal(1, self.random_start, size=6)

        # start in a non-healthy stable state
        start = np.array([163573, 5., 11945, 46, 63919, 24]) * eps
        self._state = start
        return _transform(start)

    def step(self, action: int):
        sp = _nextState(self._state, action, self.dt, self.actionEffects)
        r = HIVTreatment.reward(self._state, action, sp)
        t = HIVTreatment.terminal(self._state, action, sp)

        self._state = sp
        return (r, _transform(sp), t)

    def setState(self, state: np.ndarray):
        self._state = state.copy()

    def copy(self):
        m = HIVTreatment(self._seed)
        m._state = self._state.copy()
        return m

addToCategory('classic-control', HIVTreatment)
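A minimal interaction loop, assuming the single-argument constructor implied by `copy()` and the `(reward, observation, terminal)` return shape shown in `step`:

env = HIVTreatment(0)
obs = env.start()

for _ in range(200):
    action = 0  # hypothetical choice; action semantics depend on actionEffects
    r, obs, t = env.step(action)
    if t:
        obs = env.start()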
        # Split top/bottom left
        WallState((0, 5)),
        WallState((2, 5)),
        WallState((3, 5)),
        WallState((4, 5)),
        WallState((5, 5)),

        # Split top left/right
        WallState((5, 10)),
        WallState((5, 9)),
        WallState((5, 7)),
        WallState((5, 6)),

        # Split bottom left/right
        WallState((5, 4)),
        WallState((5, 3)),
        WallState((5, 2)),
        WallState((5, 0)),

        # Split top/bottom right
        WallState((6, 4)),
        WallState((7, 4)),
        WallState((9, 4)),
        WallState((10, 4)),
    ])

    return fourRoomsBuilder.build()

FourRooms = build()
addToCategory('gridworld', FourRooms)
addToCategory('finite-dynamics', FourRooms)
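As a sanity check, a small sketch that renders these wall coordinates on the assumed 11x11 grid with (0, 0) at the bottom left; the gaps at (1, 5), (5, 8), (5, 1), and (8, 4) are the four doorways:

import numpy as np

walls = [
    (0, 5), (2, 5), (3, 5), (4, 5), (5, 5),
    (5, 10), (5, 9), (5, 7), (5, 6),
    (5, 4), (5, 3), (5, 2), (5, 0),
    (6, 4), (7, 4), (9, 4), (10, 4),
]

grid = np.full((11, 11), '.')
for x, y in walls:
    grid[y, x] = '#'

for row in grid[::-1]:  # print with y increasing upward
    print(''.join(row))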
        sp = self.nextState(self._state, action)
        r = self.reward(self._state, action, sp)
        t = self.terminal(self._state, action, sp)

        self._state = sp
        return (r, _transform(sp), t)

    def setState(self, state: np.ndarray):
        self._state = state.copy()

    def copy(self):
        m = Acrobot(randomize=self.randomize, seed=self._seed)
        m._state = self._state.copy()
        m.physical_constants = self.physical_constants
        m.per_step_constants = self.per_step_constants

        # copy derivative function because state variables changed
        m._dsdt = self._dsdt
        return m

class StochasticAcrobot(Acrobot):
    def __init__(self, seed: int = 0):
        super().__init__(randomize=True, seed=seed)

addToCategory('classic-control', Acrobot)
addToCategory('stochastic', StochasticAcrobot)
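Because `copy()` duplicates both the state and the sampled physics, it supports lookahead-style probing without disturbing the live environment; a sketch, assuming `start`/`step` follow the same protocol as the other environments here:

env = Acrobot(randomize=False, seed=0)
obs = env.start()

# evaluate an action on a throwaway copy; env itself is untouched
probe = env.copy()
r, obs_next, t = probe.step(1)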
        m.physical_constants = self.physical_constants
        m.per_step_constants = self.per_step_constants

        # because we are changing the physics _after_ the derivatives have been
        # precomputed, we need to replace the derivatives with the correct eqns
        # for these constants
        m._dsdt = self._dsdt
        return m

class StochasticCartpole(Cartpole):
    def __init__(self, seed: int = 0):
        super().__init__(randomize=True, seed=seed)

class ContinuousActionCartpole(Cartpole):
    def nextState(self, s: np.ndarray, force: float):
        # get per-step constants
        dt = self.per_step_constants['dt'].sample(self.rng)

        force = np.clip(force, -12, 12)

        sa = np.append(s, force)
        spa = euler(self._dsdt, sa, np.array([0, dt]))

        # only need the last result of the integration
        spa = spa[-1]
        sp = spa[:-1]
        return sp

addToCategory('classic-control', Cartpole)
addToCategory('stochastic', StochasticCartpole)
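The call `euler(self._dsdt, sa, np.array([0, dt]))` implies a fixed-step integrator that returns one row per requested time, which is why the code keeps only `spa[-1]`. A minimal sketch of that assumed helper (the real `euler` is imported elsewhere, and the `dsdt(state, t)` signature is an assumption):

import numpy as np

def euler(dsdt, y0: np.ndarray, ts: np.ndarray) -> np.ndarray:
    # forward-Euler integration: one output row per time in ts
    out = np.empty((len(ts), len(y0)))
    out[0] = y0
    for i in range(len(ts) - 1):
        dt = ts[i + 1] - ts[i]
        out[i + 1] = out[i] + dt * np.asarray(dsdt(out[i], ts[i]))
    return out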
    # probably there is only one start state, but in case we ever define
    # multiple, give a uniform random chance to transition to any of them
    for start in starts:
        K[s, a, start] = 1.0 / len(starts)

def build(shape: Coords = (12, 4)):
    cliffWorldBuilder = GridWorldBuilder(shape)

    # start in the bottom left
    cliffWorldBuilder.addElement(StartState((0, 0)))

    # end in the bottom right without additional reward
    cliffWorldBuilder.addElement(GoalState((shape[0] - 1, 0), -1))

    # with a cliff in every state in between
    cliffWorldBuilder.addElements([Cliff((x, 0)) for x in range(1, shape[0] - 1)])

    cliffWorldBuilder.costToGoal = True
    return cliffWorldBuilder.build()

CliffWorld = build()
addToCategory('gridworld', CliffWorld)
addToCategory('finite-dynamics', CliffWorld)
addToCategory('sutton-barto', CliffWorld)
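Since `build` is parameterized by `shape`, variants with longer or shorter cliffs are one call away; only the default 12x4 instance is registered above. For example (a hypothetical variant, not part of the registry):

# a narrower 6x4 world with four cliff cells instead of ten
SmallCliffWorld = build(shape=(6, 4))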
    d0 = immutable(np.array([1.] + [0.] * 12))

# some utility functions to encode other important parts of the problem spec
# not necessarily environment specific, but this is as good a place as any to store them
def behaviorPolicy(s: int):
    if s <= 10:
        return np.array([0.5, 0.5])

    return np.array([1.0, 0])

def representationMatrix():
    return np.array([
        [1,    0,    0,    0   ],  # noqa: E241
        [0.75, 0.25, 0,    0   ],  # noqa: E241
        [0.5,  0.5,  0,    0   ],  # noqa: E241
        [0.25, 0.75, 0,    0   ],  # noqa: E241
        [0,    1,    0,    0   ],  # noqa: E241
        [0,    0.75, 0.25, 0   ],  # noqa: E241
        [0,    0.5,  0.5,  0   ],  # noqa: E241
        [0,    0.25, 0.75, 0   ],  # noqa: E241
        [0,    0,    1,    0   ],  # noqa: E241
        [0,    0,    0.75, 0.25],  # noqa: E241
        [0,    0,    0.5,  0.5 ],  # noqa: E241
        [0,    0,    0.25, 0.75],  # noqa: E241
        [0,    0,    0,    1   ],  # noqa: E241
    ])

addToCategory('ope', BoyanChain)
addToCategory('random-walk', BoyanChain)
addToCategory('finite-dynamics', BoyanChain)
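Each group of four states interpolates linearly between adjacent one-hot anchor states (0, 4, 8, 12), so any weight vector induces a piecewise-linear value function over the 13 states. A quick illustration with hypothetical weights:

X = representationMatrix()
w = np.array([-24., -16., -8., 0.])  # hypothetical weights, for illustration only

v = X @ w  # one value per state, linear between the anchor states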
        m.per_step_constants = self.per_step_constants
        m._dsdt = self._dsdt
        return m

class GymMountainCar(MountainCar):
    def __init__(self, seed: int = 0):
        super().__init__(randomize=False, seed=seed)

    def nextState(self, s: np.ndarray, a: int):
        return _nextState(s, a)

class StochasticMountainCar(MountainCar):
    def __init__(self, seed: int = 0):
        super().__init__(randomize=True, seed=seed)

class ContinuousActionMountainCar(MountainCar):
    def nextState(self, s: np.ndarray, a: float):
        force = np.clip(a, -3, 3)
        return self._integrate(s, force)

addToCategory('classic-control', MountainCar)
addToCategory('sutton-barto', GymMountainCar)
addToCategory('stochastic', StochasticMountainCar)
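A sketch contrasting the three variants, assuming `MountainCar.__init__` accepts `(randomize, seed)` as the `super().__init__` calls above imply, and that the base `step` forwards the action to `nextState` as in the other classic-control environments:

gym_env = GymMountainCar(seed=0)           # deterministic Gym-style dynamics
stoch_env = StochasticMountainCar(seed=0)  # randomized physical constants

cont_env = ContinuousActionMountainCar(randomize=False, seed=0)
obs = cont_env.start()
r, obs, t = cont_env.step(1.5)  # continuous throttle, clipped to [-3, 3]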