def reward(self, s=None, a=None, ns=None):
    # Marginalize the joint (next state, reward) distribution onto rewards
    # for transitions that land in ns, then sample a reward from that mass.
    t_dist = self.transition_reward_dist(s, a)
    r_dist = {}
    for (ns_, r), p in t_dist.items():
        if ns_ == ns:
            r_dist[r] = r_dist.get(r, 0) + p
    return sample_prob_dict(r_dist)
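# Every snippet here draws one outcome from a {outcome: probability} dict via
# sample_prob_dict, whose definition is not shown. A minimal sketch, assuming
# it samples keys in proportion to their (not necessarily normalized) mass:
import random

def sample_prob_dict(prob_dict):
    outcomes = list(prob_dict.keys())
    weights = list(prob_dict.values())
    # random.choices normalizes the weights, so masses need not sum to 1.
    return random.choices(outcomes, weights=weights, k=1)[0]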
def act(self, s, softmax_temp=None, randchoose=None):
    actions = self.ham.available_actions(s)
    a_q = {}
    for a in actions:
        q = self._qval(s, a)
        # if q == -np.inf:  # never take actions that make the machine loop
        #     continue
        a_q[a] = q
    if softmax_temp is None:
        softmax_temp = self.softmax_temp
    if randchoose is None:
        randchoose = self.randchoose
    adist = calc_esoftmax_dist(a_q, temp=softmax_temp, randchoose=randchoose)
    return sample_prob_dict(adist)
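# calc_esoftmax_dist is also not shown. A hedged sketch, assuming it mixes a
# softmax (Boltzmann) distribution over Q-values at temperature `temp` with a
# uniform distribution weighted by `randchoose` (an epsilon-soft policy):
import math

def calc_esoftmax_dist(a_q, temp, randchoose):
    acts = list(a_q.keys())
    if temp == 0:
        # Zero temperature degenerates to a uniform choice over argmax actions.
        max_q = max(a_q.values())
        best = [a for a in acts if a_q[a] == max_q]
        soft = {a: (1.0 / len(best) if a in best else 0.0) for a in acts}
    else:
        max_q = max(a_q.values())  # shift by the max for numerical stability
        exps = {a: math.exp((q - max_q) / temp) for a, q in a_q.items()}
        z = sum(exps.values())
        soft = {a: e / z for a, e in exps.items()}
    uniform = 1.0 / len(acts)
    return {a: randchoose * uniform + (1 - randchoose) * soft[a] for a in acts}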
def transition(self, s, a):
    # The joint sample is a (next state, reward) pair; [0] keeps the state.
    return sample_prob_dict(self.transition_reward_dist(s, a))[0]
def reward(self, s=None, a=None, ns=None):
    return sample_prob_dict(self.reward_dist(s, a, ns))
def observation(self, a: Action, ns: State) -> Observation:
    return sample_prob_dict(self.observation_dist(a, ns))
def get_init_state(self):
    return sample_prob_dict(self.get_init_state_dist())
def act(self, s, softmax_temp=None, randchoose=None):
    adist = self.act_dist(s, softmax_temp, randchoose)
    return sample_prob_dict(adist)
def transition_reward(self, s=None, a=None):
    return sample_prob_dict(self.transition_reward_dist(s, a))
def get_init_state(self):
    if len(self.init_state_dist) == 0:
        raise ValueError("No initial state defined")
    return sample_prob_dict(self.init_state_dist)
def transition(self, s: "state", ja: "joint action"):
    # The joint sample is a (next state, reward) pair; [0] keeps the state.
    return sample_prob_dict(self.transition_reward_dist(s, ja))[0]
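# How the pieces compose: a short rollout built from these samplers. `mdp`
# and `agent` are stand-ins for any objects exposing the methods above; the
# horizon of 100 steps is arbitrary.
s = mdp.get_init_state()
for _ in range(100):
    a = agent.act(s)
    # transition_reward samples a (next state, reward) pair in one draw.
    s, r = mdp.transition_reward(s, a)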