def set_transition(self, s_hash, a_desc, snext_hash, reward_obj=None, action_prob=1.0, trans_prob=1.0):
    """
    Set or Add prob and Reward entries to sn_probD and sn_rewardD.

    action_prob controls the probability of picking an action from a list of actions.
    i.e. if in state s, there can be a list of (a1,p1), (a2,p2), (a3,p3), etc.

    trans_prob controls the probability of picking next state from a list of next states.
    i.e. if taking action a in state s, there can be a list of (sn1,p1), (sn2,p2), (sn3,p3), etc.

    Rewards can vary in a stochastic environment.
    Reward objects can give constant, weighted tabular, or function-based float reward values.
    The Reward object is always associated with (s,a,sn), however, the numerical value
    can vary with probability distributions of its own.

    DELAY NORMALIZING... Allows sequential adding of multiple action/prob pairs.
    i.e. Merely set the flag "is_normalized" to False to trigger later "normalize" call.
    """
    # BUGFIX: the old default "reward_obj=Reward(const=0.0)" was evaluated ONCE at
    # function-definition time, so every default call shared the same Reward
    # instance (classic mutable-default hazard).  Use a None sentinel and build
    # a fresh Reward per call instead.
    if reward_obj is None:
        reward_obj = Reward(const=0.0)

    # record (or update) the probability of choosing a_desc while in s_hash
    self.sa_coll.set_action_prob(s_hash, a_desc, prob=action_prob)

    # record the next-state probability and reward on the Transition object
    T = self.get_transition_obj(s_hash, a_desc)
    Sn = self.state_coll.get_state_obj(snext_hash)
    T.set_transition(Sn, reward_obj=reward_obj, prob=trans_prob)
def add_to_environment(self, env):
    """Populate an environment object with the collected data about s_hash"""
    if not self.is_consistent_info():
        print('WARNING... NOT CONSISTENT. ' * 3)

    if self.total_action_calls == 0:
        print('WARNING... No Available ModelStateData to send to Environment')
    else:
        total_calls = float(self.total_action_calls)
        for a_desc, a_count in self.action_countD.items():
            if a_count <= 0:
                continue
            # fraction of calls in s_hash using a_desc
            a_prob = float(a_count) / total_calls

            snD = self.action_sn_rD.get(a_desc)
            if snD is None:
                continue
            # snD... index=sn_hash: value=rwd_ave_obj
            for sn_hash, rwd_ave_obj in snD.items():
                # fraction of times using a_desc in s_hash resulted in sn_hash
                t_prob = float(rwd_ave_obj.num_val) / float(a_count)
                env.TC.set_transition(self.s_hash, a_desc, sn_hash,
                                      reward_obj=Reward(const=rwd_ave_obj.get_ave()),
                                      action_prob=a_prob, trans_prob=t_prob)

    # make sure all normalize flags are set in env.TC
    for (s_hash, a_desc, T) in env.TC.iter_all_transitions():
        T.normalize()
def add_event(s_hash, a_desc, sn_hash):
    """Record a deterministic (s, a, sn) event in TC with its tabulated reward."""
    # look up the constant reward for landing in sn_hash (0.0 when unlisted)
    rwd = rewardD.get(sn_hash, 0.0)
    TC.set_transition(s_hash, a_desc, sn_hash,
                      reward_obj=Reward(const=rwd),
                      action_prob=1.0, trans_prob=1.0)
def add_transition(self, a_desc, snext_hash, t_prob, reward_obj):
    """Add the (sn,tp,R) triplet for the (s,a)"""
    # guard clause: reject a non-float transition probability up front
    if not is_float(t_prob):
        raise ValueError('transition prob: "%s" MUST BE A FLOAT.' % str(t_prob))
    t_prob = floatCast(t_prob)  # make sure it's a simple float

    # allow float inputs for reward... recast as Reward object
    if is_float(reward_obj):
        reward_obj = Reward(const=reward_obj)
    if not isinstance(reward_obj, Reward):
        raise ValueError('reward_obj: "%s" MUST BE A Reward object OR float.' % str(reward_obj))

    # snD... index=snext_hash: value=(t_prob, reward_obj)
    snD = self.action_snprD.setdefault(a_desc, {})
    snD[snext_hash] = (t_prob, reward_obj)
def setUp(self):
    """Build a Transition from state (2,2), action 'U', with three reward styles."""
    unittest.TestCase.setUp(self)

    start_state = State((2, 2))
    up_action = Action('U')
    self.T = Transition(start_state, up_action)

    # constant reward
    const_reward = Reward(const=1.1)
    # weighted tabular reward
    table_reward = Reward(reward_probL=[(0.0, 1), (1.0, 1), (2.0, 2)])

    # function-based (stochastic) reward
    def my_gauss():
        return random.gauss(3.0, 0.5)
    func_reward = Reward(reward_dist_func=my_gauss)

    self.T.set_transition(State((2, 3)), reward_obj=const_reward, prob=0.8)
    self.T.set_transition(State((1, 2)), reward_obj=table_reward, prob=0.1)
    self.T.set_transition(State((3, 2)), reward_obj=func_reward, prob=0.1)
    self.T.set_transition(State((0, 0)), reward_obj=func_reward, prob=0.0)
def add_transition(self, s_hash, a_desc, snext_hash, t_prob=1.0, reward_obj=None):
    """
    Add a (snext_hash, t_prob, reward_obj) transition for (s_hash, a_desc),
    creating the DefineStateMoves entry for s_hash on first use.
    """
    # BUGFIX: the old default "reward_obj=Reward(const=0.0)" was created once at
    # definition time and shared by every default call.  Use a None sentinel
    # and make a fresh Reward per call instead.
    if reward_obj is None:
        reward_obj = Reward(const=0.0)

    if s_hash not in self.define_statesD:
        self.define_statesD[s_hash] = DefineStateMoves(s_hash)

    self.define_statesD[s_hash].add_transition(a_desc, snext_hash, t_prob, reward_obj)
def test_get_random_transition(self):
    """test get random transition"""
    # add a second possible next state for ((0,0), 'R')
    self.TC.set_transition((0, 0), 'R', (0, 3),
                           reward_obj=Reward(const=1.0),
                           action_prob=1.0, trans_prob=1.0)

    sampled = [self.TC.get_prob_weighted_next_state((0, 0), 'R').hash
               for _ in range(30)]

    # two equiprobable next states -> expect roughly 15 draws of (0,1)
    self.assertGreater(sampled.count((0, 1)), 5)
def test_set_transition_prob(self):
    """test set transition prob"""
    def probe():
        # fetch both (next_state, prob) pairs for ((0,0), 'R')
        Sn1, p1 = self.TC.get_next_state_prob((0, 0), 'R', (0, 1))
        Sn2, p2 = self.TC.get_next_state_prob((0, 0), 'R', (0, 3))
        return Sn1, p1, Sn2, p2

    # create a second possibility for (0,0), 'R'
    self.TC.set_transition((0, 0), 'R', (0, 3), reward_obj=Reward(const=1.0),
                           action_prob=1.0, trans_prob=1.0)
    Sn1, p1, Sn2, p2 = probe()
    self.assertEqual(p1, 0.5)
    self.assertEqual(p1, p2)
    self.assertNotEqual(Sn1, Sn2)

    # set explicitly
    self.TC.set_transition_prob((0, 0), 'R', (0, 1), prob=0.1)
    self.TC.set_transition_prob((0, 0), 'R', (0, 3), prob=0.9)
    _, p1, _, p2 = probe()
    self.assertEqual(p1, 0.1)
    self.assertEqual(p2, 0.9)

    # try setting sole prob
    self.TC.set_sole_transition((0, 0), 'R', (0, 1))
    _, p1, _, p2 = probe()
    self.assertEqual(p1, 1.0)
    self.assertEqual(p2, 0.0)

    # try sole random
    self.TC.initialize_sole_random((0, 0), 'R')
    _, p1, _, p2 = probe()
    self.assertEqual(sorted([p1, p2]), [0.0, 1.0])

    # try equiprobable (NOTE: "intialize" spelling is the library's API name)
    self.TC.intialize_to_equiprobable((0, 0), 'R')
    Sn1, p1, Sn2, p2 = probe()
    self.assertEqual(p1, 0.5)
    self.assertEqual(p1, p2)
    self.assertNotEqual(Sn1, Sn2)
def set_transition(self, next_state_obj, reward_obj=None, prob=1.0):
    """
    Set or Add prob and Reward entries to sn_probD and sn_rewardD.

    DELAY NORMALIZING... Allows sequential adding of multiple
    next-state/prob pairs.  The flag "is_normalized" is set to False so
    that a later "normalize" call can fix up the distribution.
    """
    # BUGFIX: the old default "reward_obj=Reward(const=0.0)" was one shared
    # instance created at definition time; use a None sentinel and build a
    # fresh Reward per call.
    if reward_obj is None:
        reward_obj = Reward(const=0.0)

    # cast once (the original cast prob to float twice)
    prob = float(prob)

    # index=next_state_obj: value=probability of reaching that next state
    self.sn_probD[next_state_obj] = prob
    self.sn_hashD[next_state_obj.hash] = next_state_obj
    self.sn_rewardD[next_state_obj] = reward_obj

    self.is_normalized = False
def setUp(self):
    """Build a small grid-world TransitionColl with deterministic moves."""
    unittest.TestCase.setUp(self)
    self.TC = TransitionColl()

    actionD = {(0, 0): ('D', 'R'), (0, 1): ('L', 'R'), (0, 2): ('L', 'D', 'R'),
               (1, 0): ('U', 'D'), (1, 2): ('U', 'D', 'R'),
               (2, 0): ('U', 'R'), (2, 1): ('L', 'R'), (2, 2): ('L', 'R', 'U'),
               (2, 3): ('L', 'U')}
    rewardD = {(0, 3): 1, (1, 3): -1}

    # (row, col) displacement produced by each action
    moveD = {'U': (-1, 0), 'D': (1, 0), 'R': (0, 1), 'L': (0, -1)}

    for s, actionL in actionD.items():
        for a in actionL:
            d_row, d_col = moveD[a]
            snext_hash = (s[0] + d_row, s[1] + d_col)
            reward_val = rewardD.get(snext_hash, 0.0)
            self.TC.set_transition(s, a, snext_hash,
                                   reward_obj=Reward(const=reward_val),
                                   action_prob=1.0, trans_prob=1.0)
def test_get_list_of_next_state(self):
    """test get list of next state prob"""
    def n_next(incl_zero):
        # number of next states listed for ((0,0), 'R')
        return len(self.TC.get_list_of_all_next_state((0, 0), 'R',
                                                      incl_zero_prob=incl_zero))

    # initially a single next state
    self.assertEqual(n_next(False), 1)
    self.assertEqual(n_next(True), 1)

    # add another transition
    self.TC.set_transition((0, 0), 'R', (0, 3), reward_obj=Reward(const=1.0),
                           action_prob=1.0, trans_prob=1.0)
    self.assertEqual(n_next(False), 2)
    self.assertEqual(n_next(True), 2)

    # make one transition prob zero
    self.TC.initialize_sole_random((0, 0), 'R')
    self.assertEqual(n_next(False), 1)
    self.assertEqual(n_next(True), 2)
def test_remove_next_state(self):
    """test remove next state"""
    self.TC.set_transition((0, 0), 'R', (0, 3), reward_obj=Reward(const=1.0),
                           action_prob=1.0, trans_prob=1.0)

    Sn1, p1 = self.TC.get_next_state_prob((0, 0), 'R', (0, 1))
    Sn2, p2 = self.TC.get_next_state_prob((0, 0), 'R', (0, 3))
    # two equiprobable next states to start with
    self.assertEqual(p1, 0.5)
    self.assertEqual(p1, p2)
    self.assertNotEqual(Sn1, Sn2)

    # now remove original transition
    self.TC.remove_next_state((0, 0), 'R', (0, 1))
    Sn1, p1 = self.TC.get_next_state_prob((0, 0), 'R', (0, 1))
    Sn2, p2 = self.TC.get_next_state_prob((0, 0), 'R', (0, 3))
    self.assertIsNone(p1)
    self.assertNotEqual(p1, p2)
    self.assertNotEqual(Sn1, Sn2)
    # the remaining transition now carries all the probability
    self.assertEqual(p2, 1.0)
def test_iter_transitions(self):
    """test iter transitions"""
    # add another transition
    self.TC.set_transition((0, 0), 'R', (0, 3), reward_obj=Reward(const=1.0),
                           action_prob=1.0, trans_prob=1.0)

    def prob_list(incl_zero):
        # collect just the probabilities from the (Sn, p) iterator
        return [p for (_, p) in
                self.TC.iter_next_state_prob((0, 0), 'R', incl_zero_prob=incl_zero)]

    self.assertEqual(prob_list(False), [0.5, 0.5])

    # make one transition prob zero
    self.TC.initialize_sole_random((0, 0), 'R')
    self.assertEqual(prob_list(False), [1.0])
    self.assertEqual(sorted(prob_list(True)), [0.0, 1.0])
# NOTE(review): the for-loop below is the tail of an enclosing method (a
# summ_print-style printer) whose "def" line lies outside this view -- the
# nesting level shown here is reconstructed; confirm against the full file.
# It prints each next state, highest probability first.
for (prob, Sn) in sorted([(prob, Sn) for (Sn, prob) in self.sn_probD.items()], reverse=True):
    R = self.sn_rewardD[Sn]
    print(' %9s' % str(Sn.hash), '%6g' % prob, ' %s' % str(R)[1:-1])

if __name__ == "__main__":  # pragma: no cover
    from introrl.action import Action
    from introrl.state import State

    # build a sample Transition from state (2,2) taking action 'U'
    s = State((2, 2))
    a = Action('U')
    T = Transition(s, a)

    # constant reward
    rc = Reward(const=1.1)
    # weighted tabular reward
    reward_probL = [(0.0, 1), (1.0, 1), (2.0, 2)]
    rt = Reward(reward_probL=reward_probL)

    # function-based (stochastic) reward
    def my_gauss():
        return random.gauss(3.0, 0.5)
    rf = Reward(reward_dist_func=my_gauss)

    T.set_transition(State((2, 3)), reward_obj=rc, prob=0.8)
    T.set_transition(State((1, 2)), reward_obj=rt, prob=0.1)
    T.set_transition(State((3, 2)), reward_obj=rf, prob=0.1)
    T.set_transition(State((0, 0)), reward_obj=rc, prob=0.0)

    T.summ_print()
# NOTE(review): this chunk starts mid-statement -- the first lines below are
# the tail of a define_environment-style method whose start is outside this
# view.  Nesting is reconstructed; confirm against the full file.
                                a_prob=1.0)  # a_prob will be normalized
                sn = state_hash + action_desc
                self.add_transition(state_hash, action_desc, sn, t_prob=1.0, reward_obj=0.0)

        self.define_env_states_actions( )  # send all states and actions to environment
        self.start_state_hash = 12
        self.layout = GenericLayout(self)

# module-level sample reward: two-entry weighted table
reward_probL = [(0.0, 1), (1.0, 1)]  # will be normalized in use.
rt = Reward(reward_probL=reward_probL)

class TinyEnv(EnvBaseline):
    # presumably a minimal two-state example environment -- TODO confirm

    def __init__(self, name='Tiny Env'):
        EnvBaseline.__init__(self, name=name)

    def define_environment(self):
        # states 1..2, actions -1/+1 (step left/right)
        for state_hash in range(1, 3):
            for action_desc in [-1, 1]:
                self.add_action(state_hash, action_desc, a_prob=1.0)  # a_prob will be normalized
                sn = state_hash + action_desc
                # NOTE(review): this call is cut off at the chunk boundary;
                # its remaining arguments continue past this view.
                self.add_transition(state_hash, action_desc, sn,
def setUp(self):
    """Create constant, tabular, and function-based Reward fixtures."""
    unittest.TestCase.setUp(self)
    # constant reward of 1.1
    self.Rc = Reward(const=1.1)
    # tabular reward built from the module-level reward_probL
    self.Rt = Reward(reward_probL=reward_probL)
    # stochastic reward driven by the module-level my_gauss function
    self.Rf = Reward(reward_dist_func=my_gauss)