def reset(self):
    """Reset the wrapped env and the belief estimator.

    Returns a dict with the one-hot-encoded observation under 'obs'
    and the freshly initialized belief under 'zbel'. Also caches the
    pair in ``self.last_obs``.
    """
    raw_obs = self.env.reset()
    self.estimator.reset()
    # No action has been taken yet, so update the belief with action=None.
    belief, _ = self._update_belief(action=None, obs=raw_obs)
    encoded = to_one_hot(raw_obs, self.n)
    self.last_obs = (encoded, belief)
    return {'obs': encoded, 'zbel': belief}
def step(self, action):
    """Step the env; one-hot the observation and attach the expert label.

    The expert label packs the hidden rock-goodness flags
    (``self.env.state[2:]``) into a single integer by reading them as a
    binary number, and is stored in ``info['expert']``.
    """
    obs, reward, done, info = super(ExplicitBayesRockSample, self).step(action)
    obs['obs'] = to_one_hot(obs['obs'], self.env.nS)
    bits = ''.join(str(int(flag)) for flag in self.env.state[2:])
    info['expert'] = int(bits, 2)
    return obs, reward, done, info
def step(self, action):
    """Step the env, update the belief, and label the true slip probability.

    Returns ``({'obs': one_hot_obs, 'zbel': belief}, reward, done, info)``
    where ``info['expert']`` is the index of the environment's true slip
    probability for ``Action.A`` within the candidate set
    ``[0.1, 0.2, 0.3, 0.4, 0.5]``.
    """
    obs, reward, done, info = self.env.step(action)
    # _update_belief consumes and replaces info (e.g. adds belief metadata).
    bel, info = self._update_belief(action, obs, **info)
    obs = to_one_hot(obs, self.n)
    self.last_obs = (obs, bel)
    true_prob = self.env.slip_prob[Action.A]
    # Use isclose rather than exact float equality: if slip_prob was ever
    # derived by arithmetic, `== true_prob` could match nothing and the
    # [0][0] indexing would raise IndexError.
    candidates = np.array([0.1, 0.2, 0.3, 0.4, 0.5])
    info['expert'] = int(np.flatnonzero(np.isclose(candidates, true_prob))[0])
    return {'obs': obs, 'zbel': bel}, reward, done, info
def _augment_observation(self, action, obs, **kwargs):
    """Feed the estimator and optionally append its MLE to the observation.

    The MLE is always recorded in ``kwargs['mle']``; it is concatenated
    onto the (possibly one-hot-encoded) observation only when
    ``self.augment_to_obs`` is set.
    """
    self.estimator.estimate(action, obs, **kwargs)

    point_est = self.estimator.get_mle()
    if not isinstance(point_est, np.ndarray):
        point_est = np.array([point_est])

    # Discrete observations are one-hot encoded before augmentation.
    if isinstance(self.env.observation_space, Discrete):
        obs = to_one_hot(obs, self.env.observation_space.n)
    if not isinstance(obs, np.ndarray):
        obs = np.array([obs])

    kwargs['mle'] = point_est
    result = (
        np.concatenate([obs, point_est], axis=0)
        if self.augment_to_obs
        else obs
    )
    return result, kwargs
def _augment_observation(self, action, obs, **kwargs):
    """Feed the estimator and optionally append its belief to the observation.

    The belief is always recorded in ``kwargs['belief']``; it is
    concatenated onto the (possibly one-hot-encoded) observation only
    when ``self.augment_to_obs`` is set.
    """
    self.estimator.estimate(action, obs, **kwargs)

    bel = self.estimator.get_belief()
    if not isinstance(bel, np.ndarray):
        bel = np.array([bel])

    # Discrete observations are one-hot encoded before augmentation.
    if isinstance(self.env.observation_space, Discrete):
        obs = to_one_hot(obs, self.env.observation_space.n)
    if not isinstance(obs, np.ndarray):
        obs = np.array([obs])

    kwargs['belief'] = bel
    result = (
        np.concatenate([obs, bel], axis=0)
        if self.augment_to_obs
        else obs
    )
    return result, kwargs