def sample_states(env, q_fn, visitation_probs, n_sample, ent_wt):
    """Sample (state, action, next-state) tuples from a visitation table.

    Args:
        env: Object exposing ``tabular_trans_distr(state, action)``, whose
            result is used as a probability vector over successor indices.
        q_fn: Passed to ``get_policy`` together with ``ent_wt``.
        visitation_probs: 2-D array of shape ``(dS, dA)``; its flattened
            entries are used as sampling probabilities over (state, action)
            pairs.
        n_sample: Number of pairs to draw.
        ent_wt: Forwarded to ``get_policy`` as ``ent_wt``.

    Returns:
        A dict with keys ``'observations'`` and ``'actions'`` (the sampled
        indices passed through ``flat_to_one_hot``), ``'a_logprobs'`` (log of
        the policy entry at each sampled pair) and ``'observations_next'``
        (array of encoded successor states, one per sample).
    """
    n_states, n_actions = visitation_probs.shape
    n_pairs = n_states * n_actions

    # Draw flat indices into the row-major (state, action) grid, weighted by
    # the flattened visitation table.
    flat_draws = np.random.choice(
        np.arange(n_pairs),
        size=n_sample,
        p=visitation_probs.reshape(n_pairs),
    )
    policy = get_policy(q_fn, ent_wt=ent_wt)

    # Undo the flattening: quotient is the state row, remainder the action
    # column.
    state_idx, action_idx = divmod(flat_draws, n_actions)
    a_logprobs = np.log(policy[state_idx, action_idx])

    # One successor draw per sampled pair, in sample order.
    encoded_successors = []
    for state, action in zip(state_idx, action_idx):
        t_distr = env.tabular_trans_distr(state, action)
        successor = np.random.choice(np.arange(len(t_distr)), p=t_distr)
        encoded_successors.append(flat_to_one_hot(successor, ndim=n_states))
    observations_next = np.array(encoded_successors)

    return {
        'observations': flat_to_one_hot(state_idx, ndim=n_states),
        'actions': flat_to_one_hot(action_idx, ndim=n_actions),
        'a_logprobs': a_logprobs,
        'observations_next': observations_next,
    }
def sample_states(env, q_fn, visitation_probs, n_sample, ent_wt):
    """Draw ``n_sample`` transitions weighted by ``visitation_probs``.

    (state, action) pairs are sampled from the flattened ``(dS, dA)``
    visitation table. For each pair a successor index is then sampled from
    ``env.tabular_trans_distr(state, action)``.

    NOTE(review): this file contains an earlier definition with the same
    name and signature; confirm which one is meant to be kept.

    Args:
        env: Provides ``tabular_trans_distr(state, action)``.
        q_fn: Handed to ``get_policy`` along with ``ent_wt``.
        visitation_probs: Array of shape ``(dS, dA)`` used as sampling
            weights over (state, action) pairs.
        n_sample: How many pairs to draw.
        ent_wt: Keyword argument forwarded to ``get_policy``.

    Returns:
        Dict with ``'observations'``, ``'actions'``, ``'a_logprobs'`` and
        ``'observations_next'`` entries; states and actions are encoded via
        ``flat_to_one_hot``.
    """
    dS, dA = visitation_probs.shape
    total_pairs = dS * dA
    pair_ids = np.random.choice(
        np.arange(total_pairs),
        size=n_sample,
        p=visitation_probs.reshape(total_pairs),
    )
    policy = get_policy(q_fn, ent_wt=ent_wt)

    # Flat index -> (row, column) of the (dS, dA) table.
    observations = pair_ids // dA
    actions = pair_ids % dA
    a_logprobs = np.log(policy[observations, actions])

    def _sample_successor(state, action):
        # Sample one successor index for (state, action) and encode it.
        t_distr = env.tabular_trans_distr(state, action)
        drawn = np.random.choice(np.arange(len(t_distr)), p=t_distr)
        return flat_to_one_hot(drawn, ndim=dS)

    observations_next = np.array(
        [_sample_successor(observations[i], actions[i]) for i in range(n_sample)]
    )

    result = {}
    result['observations'] = flat_to_one_hot(observations, ndim=dS)
    result['actions'] = flat_to_one_hot(actions, ndim=dA)
    result['a_logprobs'] = a_logprobs
    result['observations_next'] = observations_next
    return result
def step(self, a):
    """Advance the environment by one transition under action ``a``.

    The successor state is sampled from ``self.transitions[self.cur_state, a]``
    and the reward is read from ``self.reward`` indexed by
    (departing state, action, successor state).

    Returns:
        A 4-tuple ``(obs, r, done, info)``: the new state encoded by
        ``flat_to_one_hot``, the reward, a flag that is True only when
        ``self.terminate_on_reward`` is set and the reward is positive, and
        an empty dict.
    """
    departing = self.cur_state
    probs = self.transitions[departing, a]
    successor = np.random.choice(np.arange(self.nstates), p=probs)

    # The reward is keyed on the state being left, so look it up before
    # cur_state is overwritten.
    r = self.reward[departing, a, successor]
    self.cur_state = successor

    obs = flat_to_one_hot(self.cur_state, ndim=self.nstates)
    done = bool(self.terminate_on_reward and r > 0)
    return obs, r, done, {}
def initial_state_distribution(self):
    """Return ``self.init_state`` encoded by ``flat_to_one_hot``.

    The encoding is requested with ``ndim=self.nstates``.
    """
    start_state = self.init_state
    n_states = self.nstates
    return flat_to_one_hot(start_state, ndim=n_states)
def reset(self):
    """Set the current state back to ``self.init_state``.

    Returns:
        The restored state encoded by ``flat_to_one_hot`` with
        ``ndim=self.nstates``.
    """
    self.cur_state = self.init_state
    return flat_to_one_hot(self.cur_state, ndim=self.nstates)