def sample(self, state, action):
    if state._context != self.current_outcome.observation._context:
        self._gym_env.set_state(state._context)
    outcome = super().step(action)
    observation = GymDomainStateProxy(
        state=normalize_and_round(outcome.observation),
        context=self._gym_env.get_state(),
    )
    self.current_outcome = EnvironmentOutcome(
        observation=observation,
        value=outcome.value,
        termination=outcome.termination,
        info=outcome.info,
    )
    return self.current_outcome
def reset(self):
    self.current_outcome = EnvironmentOutcome(
        observation=GymDomainStateProxy(state=super().reset(), context=[]),
        value=None,
        termination=False,
        info=None,
    )
    return self.current_outcome.observation
def sample(self, state, action):
    if state != self.current_outcome.observation:
        self.reset()
        for a in state._context:
            self.step(a)
    outcome = self.step(action)
    observation = GymDomainStateProxy(
        state=outcome.observation._state,
        context=state._context + [action],
    )
    self.current_outcome = EnvironmentOutcome(
        observation=observation,
        value=outcome.value,
        termination=outcome.termination,
        info=outcome.info,
    )
    return self.current_outcome
def get_next_state_distribution(self, state, action):
    if state != self.current_outcome.observation:
        self.reset()
        for a in state._context:
            self.step(a)
    outcome = self.step(action)
    observation = GymDomainStateProxy(
        state=outcome.observation._state,
        context=state._context + [action],
    )
    self.current_outcome = EnvironmentOutcome(
        observation=observation,
        value=outcome.value,
        termination=outcome.termination,
        info=outcome.info,
    )
    return DiscreteDistribution([(observation, 1.0)])
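# The two overrides above rebuild the simulator state by resetting and replaying
# the recorded action context whenever the requested state differs from the one
# last produced.  Below is a minimal, self-contained sketch of that replay idea;
# ToyEnv and ReplaySampler are hypothetical illustrations, not part of the
# original code.


class ToyEnv:
    """Tiny stand-in environment: the state is a running sum of actions."""

    def reset(self):
        self.total = 0
        return self.total

    def step(self, action):
        self.total += action
        return self.total, -1.0, False, {}


class ReplaySampler:
    def __init__(self, env):
        self.env = env
        self.last_obs = None
        self.last_context = None

    def sample(self, state, context, action):
        # If asked about a state we are not currently in, rebuild it by
        # resetting and replaying its action context from scratch.
        if state != self.last_obs or context != self.last_context:
            self.env.reset()
            for a in context:
                self.env.step(a)
        obs, reward, done, info = self.env.step(action)
        self.last_obs, self.last_context = obs, context + [action]
        return obs, reward, done, info


sampler = ReplaySampler(ToyEnv())
s0 = sampler.env.reset()
print(sampler.sample(s0, [], 2))  # transitions from the initial state
print(sampler.sample(s0, [], 3))  # replays back to the initial state first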
def step(self, action):
    outcome = super().step(action)
    observation = GymDomainStateProxy(state=outcome.observation, context=None)
    return EnvironmentOutcome(
        observation=observation,
        value=outcome.value,
        termination=outcome.termination,
        info=outcome.info,
    )
def _sample(
    self,
    memory: D.T_memory[D.T_state],
    action: D.T_agent[D.T_concurrency[D.T_event]],
) -> EnvironmentOutcome[
    D.T_agent[D.T_observation],
    D.T_agent[TransitionValue[D.T_value]],
    D.T_agent[D.T_info],
]:
    o = super()._sample(memory, action)
    return EnvironmentOutcome(
        observation=GymDomainStateProxy(
            state=normalize_and_round(o.observation._state),
            context=o.observation._context,
        ),
        value=TransitionValue(reward=o.value.reward - 1),
        termination=o.termination,
        info=o.info,
    )
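# The _sample() override above normalizes the parent domain's observation and
# applies a constant per-step penalty (reward - 1) to its transition value.
# Hedged sketch of that reward-shaping idea in generic form; base_sample and
# step_penalty are illustrative names, not from the source.


def shaped_sample(base_sample, memory, action, step_penalty=1.0):
    obs, reward, done, info = base_sample(memory, action)
    return obs, reward - step_penalty, done, info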
def reset(self):
    self.current_outcome = EnvironmentOutcome(
        observation=GymDomainStateProxy(
            state=normalize_and_round(super().reset()),
            context=self._gym_env.get_state(),
        ),
        value=None,
        termination=False,
        info=None,
    )
    return self.current_outcome.observation
def step(self, action):
    outcome = super().step(action)
    observation = GymDomainStateProxy(
        state=normalize_and_round(outcome.observation),
        context=self._gym_env.get_state(),
    )
    return EnvironmentOutcome(
        observation=observation,
        value=outcome.value,
        termination=outcome.termination,
        info=outcome.info,
    )
def get_next_state_distribution(self, state, action):
    if state._context != self.current_outcome.observation._context:
        self._gym_env.set_state(state._context)
    outcome = super().step(action)
    observation = GymDomainStateProxy(
        state=normalize_and_round(outcome.observation),
        context=self._gym_env.get_state(),
    )
    self.current_outcome = EnvironmentOutcome(
        observation=observation,
        value=outcome.value,
        termination=outcome.termination,
        info=outcome.info,
    )
    return DiscreteDistribution([(observation, 1.0)])
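# The overrides that call self._gym_env.get_state()/set_state() avoid replaying
# actions altogether: each observation carries a full simulator snapshot as its
# context, and the snapshot is restored only when the planner asks about a state
# other than the one the environment is currently in.  Minimal, self-contained
# sketch of that snapshot-and-restore pattern; SnapshotEnv is a hypothetical
# stand-in for a Gym environment exposing get_state()/set_state().
import copy


class SnapshotEnv:
    def __init__(self):
        self.total = 0

    def step(self, action):
        self.total += action
        return self.total, -1.0, False, {}

    def get_state(self):
        return copy.deepcopy(self.total)

    def set_state(self, snapshot):
        self.total = copy.deepcopy(snapshot)


env = SnapshotEnv()
root = env.get_state()   # snapshot the simulator at the root of the search

env.step(5)              # explore one branch...
branch_a = env.get_state()

env.set_state(root)      # ...jump back to the root without replaying anything
env.step(7)              # and explore another branch
assert env.get_state() == 7

env.set_state(branch_a)  # restore the first branch on demand
assert env.get_state() == 5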
def decode(outcome):
    return EnvironmentOutcome(
        observation=MyShmProxy.StateProxy.decode(outcome[0]),
        value=MyShmProxy.TransitionValueProxy.decode(outcome[1:3]),
        termination=MyShmProxy.BoolProxy.decode(outcome[3]),
    )
def decode(outcome):
    return EnvironmentOutcome(
        observation=GridShmProxy.StateProxy.decode(outcome[0]),
        value=GridShmProxy.ValueProxy.decode(outcome[1:3]),
        termination=GridShmProxy.BoolProxy.decode(outcome[3]),
    )
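# Both decode() methods above rebuild a structured EnvironmentOutcome from a
# flat shared-memory buffer: slot 0 holds the encoded state, slots 1-2 the
# transition value, and slot 3 the termination flag.  Minimal sketch of that
# flat-buffer decoding idea; Outcome and Value are illustrative namedtuples,
# not the library's proxy classes.
from collections import namedtuple

Outcome = namedtuple("Outcome", ["observation", "value", "termination"])
Value = namedtuple("Value", ["reward", "cost"])


def decode_outcome(buffer):
    # Assumed layout: [state, reward, cost, termination_flag]
    return Outcome(
        observation=buffer[0],
        value=Value(reward=buffer[1], cost=buffer[2]),
        termination=bool(buffer[3]),
    )


print(decode_outcome([(2, 3), -1.0, 1.0, 0]))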