class BayesHumanoidPushingEnv(BayesEnv):
    """Bayesian wrapper around ``HumanoidPushingEnv``.

    Maintains a discrete belief over the env's physical parameters and
    appends that belief vector to every observation returned by ``step``.

    Args:
        reset_params: When True, a fresh parameterized env is sampled from
            ``env_sampler`` on every ``reset``.
    """

    def __init__(self, reset_params=True):
        # NOTE(review): `discretization` is read from module scope — confirm
        # it is defined wherever this class is imported from.
        env = HumanoidPushingEnv()
        self.estimator = ParamEnvDiscreteEstimator(env, discretization=discretization)
        self.env_sampler = DiscreteParamEnvSampler(env, discretization)
        super().__init__(env, self.estimator)
        self.nominal_env = env
        self.reset_params = reset_params

    def reset(self):
        """Optionally resample env parameters, then reset the wrapped env."""
        if self.reset_params:
            self.env = self.env_sampler.sample()
        return super().reset()

    def step(self, action):
        """Step the wrapped env, update the parameter belief, and return the
        observation with the belief vector concatenated onto it.

        Returns:
            (obs, reward, done, info) where ``obs`` is the env observation
            concatenated with the belief, and ``info`` additionally carries
            ``prev_state``, ``curr_state`` and ``belief``.
        """
        prev_state = self.env.get_state()
        obs, reward, done, info = self.env.step(action)
        info['prev_state'] = prev_state
        info['curr_state'] = self.env.get_state()
        # Fold the observed transition into the discrete parameter belief.
        self.estimator.estimate(action, obs, **info)
        belief = self.estimator.get_belief()
        info['belief'] = belief
        obs = np.concatenate([obs, belief], axis=0)
        return obs, reward, done, info
class ExplicitBayesHumanoidPushingEnv(ExplicitBayesEnv):
    """Bayesian wrapper around ``HumanoidPushingEnv`` with an explicit belief.

    Unlike the concatenating wrapper, observations are dicts
    ``{'obs': env_obs, 'zbel': belief}``; ``info['expert']`` carries the
    index of the true ``friction`` parameter within the sampler's space.

    Args:
        reset_params: When True, a fresh parameterized env is sampled from
            ``env_sampler`` on every ``reset``.
    """

    def __init__(self, reset_params=True):
        # NOTE(review): `discretization` is read from module scope — confirm
        # it is defined wherever this class is imported from.
        env = HumanoidPushingEnv()
        self.estimator = ParamEnvDiscreteEstimator(env, discretization=discretization)
        self.env_sampler = DiscreteParamEnvSampler(env, discretization)
        super().__init__(env, self.estimator)
        self.nominal_env = env
        # Expose both the raw env observation and the belief to the agent.
        self.observation_space = Dict(
            {"obs": env.observation_space, "zbel": self.estimator.belief_space})
        self.internal_observation_space = env.observation_space
        self.env = env
        self.reset_params = reset_params

    def _update_belief(self, action, obs, **kwargs):
        """Fold ``(action, obs)`` into the estimator; return (belief, kwargs)."""
        self.estimator.estimate(action, obs, **kwargs)
        belief = self.estimator.get_belief()
        return belief, kwargs

    def step(self, action):
        """Step the wrapped env and update the belief.

        Returns:
            ({'obs': obs, 'zbel': belief}, reward, done, info) where ``info``
            additionally carries ``prev_state``, ``curr_state`` and
            ``expert`` (index of the true friction value).
        """
        prev_state = self.env.get_state()
        obs, reward, done, info = self.env.step(action)
        info['prev_state'] = prev_state
        info['curr_state'] = self.env.get_state()
        bel, info = self._update_belief(action, obs, **info)
        true_param = self.env.get_params()
        friction = true_param['friction']
        # Index of the true friction value in the sampler's parameter grid.
        # NOTE(review): original code reads `param_sampler_space` here but
        # `param_space` in __init__ — verify which attribute the sampler
        # actually exposes.
        exp_id = np.argwhere(
            self.env_sampler.param_sampler_space['friction'] == friction)[0, 0]
        info['expert'] = exp_id
        return {'obs': obs, 'zbel': bel}, reward, done, info

    def reset(self):
        """Optionally resample env parameters, reset env and estimator, and
        return the initial dict observation with the prior belief."""
        if self.reset_params:
            self.env = self.env_sampler.sample()
        obs = self.env.reset()
        self.estimator.reset()
        bel, _ = self._update_belief(action=None, obs=obs)
        self.last_obs = (obs, bel)
        return {'obs': obs, 'zbel': bel}