Example 1
def reward(self, s=None, a=None, ns=None):
    # Condition the joint (next state, reward) distribution on the
    # observed next state ns, accumulating the probability of each reward.
    t_dist = self.transition_reward_dist(s, a)
    r_dist = {}
    for (ns_, r), p in t_dist.items():
        if ns_ == ns:
            r_dist[r] = r_dist.get(r, 0) + p
    # Sample a reward from the (unnormalized) conditional distribution.
    return sample_prob_dict(r_dist)
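
The helper `sample_prob_dict` is not defined in any of these snippets; from its call sites it appears to take a dict mapping outcomes to probability weights and return a single sampled outcome. A minimal sketch under that assumption (the real helper may normalize or validate its input differently):

import random

def sample_prob_dict(prob_dict):
    # Draw one key from an {outcome: probability} dict. random.choices
    # normalizes the weights, so they need not sum exactly to 1.
    outcomes = list(prob_dict.keys())
    weights = list(prob_dict.values())
    return random.choices(outcomes, weights=weights, k=1)[0]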
Example 2
def act(self, s, softmax_temp=None, randchoose=None):
    # Score each action available in state s with its Q-value.
    actions = self.ham.available_actions(s)
    a_q = {}
    for a in actions:
        q = self._qval(s, a)
        # if q == -np.inf:  # never take actions that loop in the machine
        #     continue
        a_q[a] = q

    # Fall back to the agent's default exploration parameters.
    if softmax_temp is None:
        softmax_temp = self.softmax_temp
    if randchoose is None:
        randchoose = self.randchoose

    # Turn the Q-values into an epsilon-softmax action distribution
    # and sample an action from it.
    adist = calc_esoftmax_dist(a_q,
                               temp=softmax_temp,
                               randchoose=randchoose)
    return sample_prob_dict(adist)
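
`calc_esoftmax_dist` is likewise external to the snippet. Its name and arguments suggest an epsilon-softmax policy: with probability `randchoose` choose uniformly over the available actions, otherwise weight them by a softmax over their Q-values at temperature `temp`. A hedged sketch of that interpretation (the actual helper may handle edge cases differently):

import numpy as np

def calc_esoftmax_dist(a_q, temp, randchoose):
    # Assumed semantics: mix a softmax-over-Q distribution with a uniform
    # one, where randchoose is the weight on the uniform component.
    actions = list(a_q.keys())
    qs = np.array([a_q[a] for a in actions], dtype=float)
    if temp == 0:
        soft = (qs == qs.max()).astype(float)  # greedy over max-Q actions
        soft /= soft.sum()
    else:
        exp_q = np.exp((qs - qs.max()) / temp)  # shift by max for stability
        soft = exp_q / exp_q.sum()
    uniform = np.ones(len(actions)) / len(actions)
    probs = (1 - randchoose) * soft + randchoose * uniform
    return dict(zip(actions, probs))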
Example 3
def transition(self, s, a):
    # Sample a (next state, reward) pair and keep only the next state.
    return sample_prob_dict(self.transition_reward_dist(s, a))[0]
Example 4
def reward(self, s=None, a=None, ns=None):
    return sample_prob_dict(self.reward_dist(s, a, ns))
Example 5
def observation(self, a: Action, ns: State) -> Observation:
    return sample_prob_dict(self.observation_dist(a, ns))
Example 6
def get_init_state(self):
    return sample_prob_dict(self.get_init_state_dist())
Example 7
def act(self, s, softmax_temp=None, randchoose=None):
    adist = self.act_dist(s, softmax_temp, randchoose)
    return sample_prob_dict(adist)
Example 8
def transition_reward(self, s=None, a=None):
    return sample_prob_dict(self.transition_reward_dist(s, a))
Example 9
def get_init_state(self):
    if len(self.init_state_dist) == 0:
        raise ValueError("No initial state defined")
    return sample_prob_dict(self.init_state_dist)
Example 10
def transition(self,
               s: "state",
               ja: "joint action"):
    # Sample a (next state, reward) pair for the joint action ja and
    # return only the next state.
    return sample_prob_dict(self.transition_reward_dist(s, ja))[0]
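
Taken together, these sampled methods compose naturally into a rollout loop. The sketch below is purely illustrative: `rollout`, `mdp`, `agent`, and `max_steps` are hypothetical names, and it assumes only the method signatures shown in the examples above.

def rollout(mdp, agent, max_steps=100):
    # Hypothetical driver: sample a start state, then repeatedly sample an
    # action, a next state, and a reward using the methods shown above.
    s = mdp.get_init_state()
    trajectory = []
    for _ in range(max_steps):
        a = agent.act(s)
        ns = mdp.transition(s, a)
        r = mdp.reward(s, a, ns)
        trajectory.append((s, a, ns, r))
        s = ns
    return trajectory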