from introrl.dp_funcs.dp_policy_iter import dp_policy_iteration
from introrl.policy import Policy
from introrl.state_values import StateValues
from introrl.mdp_data.car_rental import get_env
from introrl.utils import pickle_esp

env = get_env()

policy = Policy(environment=env)
policy.intialize_policy_to_random(env=env)

state_value = StateValues(env)
state_value.init_Vs_to_zero()

dp_policy_iteration(policy, state_value, do_summ_print=True,
                    show_start_policy=True,
                    max_iter=1000, err_delta=0.0001, gamma=0.9)

diag_colorD = {'5': 'r', '4': 'g', '3': 'b', '2': 'c', '1': 'y', '0': 'w',
               '-5': 'r', '-4': 'g',


import unittest

from introrl.policy import Policy
from introrl.mdp_data.simple_grid_world import get_gridworld


class MyTest(unittest.TestCase):

    def setUp(self):
        unittest.TestCase.setUp(self)
        self.gridworld = get_gridworld()
        self.P = Policy(environment=self.gridworld)
        self.P.intialize_policy_to_equiprobable(env=self.gridworld)

    def tearDown(self):
        unittest.TestCase.tearDown(self)
        del self.P

    def test_should_always_pass_cleanly(self):
        """Should always pass cleanly."""
        pass

    def test_myclass_existence(self):
        """Check that myclass exists"""
        # See if the self.P object exists
        self.assertIsInstance(self.P, Policy, msg=None)

    def test_set_policy_from_default_pi(self):
        """test set policy from default pi"""
        policyD = self.gridworld.get_default_policy_desc_dict()
        self.P.set_policy_from_piD(policyD)

        self.assertEqual(self.P.get_action_prob((2, 2), 'U'), 1.0)
        self.assertEqual(self.P.get_action_prob((2, 2), 'R'), 0.0)
        self.assertEqual(self.P.get_action_prob((2, 2), 'D'), None)

    #def test_set_policy_from_list_of_actions(self):
    #    """test set policy from list of actions"""
    #    piD = {(0, 0):('R','D') }
    #    self.P.set_policy_from_piD( piD )
    #    self.assertEqual(self.P.get_action_prob( (0,0), 'U'), None)
    #    self.assertEqual(self.P.get_action_prob( (0,0), 'R'), 0.5)
    #    self.assertEqual(self.P.get_action_prob( (0,0), 'D'), 0.5)

    #def test_set_policy_from_list_of_action_probs(self):
    #    """test set policy from list of action probs"""
    #    piD = {(0, 0):[('R',0.6), ('D',0.4)] }
    #    self.P.set_policy_from_piD( piD )
    #    self.assertEqual(self.P.get_action_prob( (0,0), 'U'), None)
    #    self.assertEqual(self.P.get_action_prob( (0,0), 'R'), 0.6)
    #    self.assertEqual(self.P.get_action_prob( (0,0), 'D'), 0.4)
    #
    #    # make (action, prob) entry too long.
    #    with self.assertRaises(ValueError):
    #        piD = {(0, 0):[('R',0.6,0.4), ('D',0.4,0.6)] }
    #        self.P.set_policy_from_piD( piD )

    def test_learn_all_s_and_a(self):
        """test learn all s and a"""
        self.P.learn_all_states_and_actions_from_env(self.gridworld)

    def test_initialize_to_random(self):
        """test initialize to random"""
        self.P.intialize_policy_to_random(env=self.gridworld)

        apL = self.P.get_list_of_all_action_desc_prob((0, 2), incl_zero_prob=True)
        pL = [p for (adesc, p) in apL]
        self.assertEqual(sorted(pL), [0.0, 0.0, 1.0])

    def test_iterate_adesc_p(self):
        """test iterate adesc p"""
        apL = []
        for (a_desc, p) in self.P.iter_policy_ap_for_state((0, 0), incl_zero_prob=False):
            apL.append((a_desc, p))

        self.assertIn(('R', 0.5), apL)
        self.assertIn(('D', 0.5), apL)
        self.assertNotIn(('U', 0.5), apL)

    def test_iterate_all_states(self):
        """test iterate all states"""
        sL = []
        for s_hash in self.P.iter_all_policy_states():
            sL.append(s_hash)
        sL.sort()

        self.assertEqual(len(sL), 9)
        self.assertEqual(sL[0], (0, 0))
        self.assertEqual(sL[-1], (2, 3))

    def test_get_single_action(self):
        """test get single action"""
        a_desc = self.P.get_single_action((0, 0))
        self.assertIn(a_desc, ('R', 'D'))

        a_desc = self.P.get_single_action((99, 99))
        self.assertEqual(a_desc, None)
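
# ---------------------------------------------------------------------------
# Hypothetical convenience block (not part of the original excerpt): it lets
# this test module be executed directly, in addition to being picked up by a
# discovery runner such as "python -m unittest discover".
# ---------------------------------------------------------------------------
if __name__ == '__main__':
    unittest.main()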


                      verbosity=0, show_env_states=False)


if __name__ == "__main__":  # pragma: no cover

    import sys
    from introrl.policy import Policy
    from introrl.state_values import StateValues
    from introrl.dp_funcs.dp_policy_eval import dp_policy_evaluation
    from introrl.mdp_data.simple_grid_world import get_gridworld

    gridworld = get_gridworld()

    pi = Policy(environment=gridworld)
    #pi.intialize_policy_to_equiprobable(env=gridworld)
    pi.intialize_policy_to_random(env=gridworld)
    #pi.learn_all_states_and_actions_from_env( gridworld )
    #pi.set_policy_from_piD( gridworld.get_default_policy_desc_dict() )

    # change one action from gridworld default
    pi.set_sole_action((1, 0), 'D')  # is 'U' in default

    sv = StateValues(gridworld)
    sv.init_Vs_to_zero()

    dp_policy_iteration(pi, sv, do_summ_print=True,
                        show_each_policy_change=True,
                        max_iter=1000,


def mc_exploring_starts(environment, initial_policy='default',
                        read_pickle_file='',
                        save_pickle_file='',
                        first_visit=True,
                        do_summ_print=True, showRunningAve=False,
                        fmt_Q='%g', fmt_R='%g',
                        show_initial_policy=True,
                        max_num_episodes=1000, min_num_episodes=10,
                        max_abserr=0.001, gamma=0.9,
                        max_episode_steps=10000,
                        iteration_prints=0):
    """
    ... GIVEN AN ENVIRONMENT ...
    apply Monte Carlo Exploring Starts to find the OPTIMAL POLICY

    initial_policy can be 'default', 'random', a policy dictionary, or a Policy object.

    Returns: Policy and ActionValueRunAveColl objects

    Uses Episode Discounted Returns to find Q(s,a), the Action-Value Function.

    Terminates when abserr < max_abserr

    Assumes that Q(s,a), action_value_ave, has been initialized prior to the call.

    Assumes the environment attached to the policy has the method
    "get_any_action_state_hash" so that episodes can begin at any action state.

    CREATES BOTH policy AND action_value OBJECTS.
    """

    # create Policy and ActionValueRunAveColl objects
    policy = Policy(environment=environment)

    if initial_policy == 'default':
        print('Initializing Policy to "default" in mc_exploring_starts')
        policy.learn_a_legal_action_from_env(env=environment)
        policy.set_policy_from_piD(environment.get_default_policy_desc_dict())
    elif initial_policy == 'random':
        print('Initializing Policy to "random" in mc_exploring_starts')
        policy.intialize_policy_to_random(env=environment)
    elif isinstance(initial_policy, Policy):
        policy = initial_policy
    else:
        print('Initializing Policy to "custom policy" in mc_exploring_starts')
        policy.learn_a_legal_action_from_env(env=environment)
        policy.set_policy_from_piD(initial_policy)

    action_value_ave = ActionValueRunAveColl(environment)
    action_value_ave.init_Qsa_to_zero()  # Terminal states w/o an action are NOT included
    #action_value_ave.summ_print()

    if read_pickle_file:
        policy.init_from_pickle_file(read_pickle_file)
        action_value_ave.init_from_pickle_file(read_pickle_file)

    if do_summ_print:
        if show_initial_policy:
            print('=============== STARTING WITH THE INITIAL POLICY ====================')
            policy.summ_print(verbosity=0, environment=environment,
                              show_env_states=False, none_str='*')

    s = 'Starting a Maximum of %i Monte Carlo Exploring Start Episodes\nfor "%s" with Gamma = %g' % \
        (max_num_episodes, environment.name, gamma)
    banner(s, banner_char='', leftMargin=0, just='center')

    # create an Episode object for getting returns
    episode = Episode(environment.name + ' Episode')

    # set counter and flag
    num_episodes = 0
    keep_looping = True
    progress_str = ''

    while (num_episodes <= max_num_episodes - 1) and keep_looping:
        keep_looping = False
        abserr = 0.0  # calculated below as part of termination criteria

        for start_hash in environment.iter_all_action_states(randomize=True):
            a_descL = environment.get_state_legal_action_list(start_hash)
            # randomize action order
            random.shuffle(a_descL)

            # try every initial action for each start_hash
            for a_desc in a_descL:
                # break from inner loop if max_num_episodes is hit.
                if num_episodes >= max_num_episodes:
                    break

                make_episode(start_hash, policy, environment,
                             environment.terminal_set,
                             episode=episode,
                             first_a_desc=a_desc,
                             max_steps=max_episode_steps,
                             eps_greedy=None)
                num_episodes += 1

                for dr in episode.get_rev_discounted_returns(gamma=gamma,
                                                             first_visit=first_visit,
                                                             visit_type='SA'):
                    # look at each step from episode and calc average Q(s,a)
                    (s, a, r, sn, G) = dr
                    action_value_ave.add_val(s, a, G)

                    # make the policy greedy w.r.t. the current Q(s,a) averages,
                    # breaking ties among best actions at random
                    aL = environment.get_state_legal_action_list(s)
                    if aL:
                        best_a_desc, best_a_val = aL[0], float('-inf')
                        bestL = [best_a_desc]
                        for a in aL:
                            q = action_value_ave.get_ave(s, a)
                            if q > best_a_val:
                                best_a_desc, best_a_val = a, q
                                bestL = [a]
                            elif q == best_a_val:
                                bestL.append(a)
                        best_a_desc = random.choice(bestL)
                        policy.set_sole_action(s, best_a_desc)

        abserr = action_value_ave.get_biggest_action_state_err()
        if abserr > max_abserr:
            keep_looping = True

        if num_episodes < min_num_episodes:
            keep_looping = True  # must loop for min_num_episodes at least

        pc_done = 100.0 * float(num_episodes) / float(max_num_episodes)
        out_str = '%3i%%' % (5 * (int(pc_done / 5.0)))
        if out_str != progress_str:
            score = environment.get_policy_score(policy=policy,
                                                 start_state_hash=None,
                                                 step_limit=1000)
            print(out_str, ' score=%s' % str(score),
                  ' = (r_sum, n_steps, msg)',
                  ' estimated err =', abserr)
            progress_str = out_str

    if do_summ_print:
        s = ''
        if num_episodes >= max_num_episodes:
            s = ' (NOTE: STOPPED ON MAX-ITERATIONS)'

        print('Exited MC First-Visit Value Iteration', s)
        print(' num episodes =', num_episodes,
              ' (min limit=%i)' % min_num_episodes,
              ' (max limit=%i)' % max_num_episodes)
        print(' gamma =', gamma)
        print(' estimated err =', abserr)
        print(' Error limit =', max_abserr)

        action_value_ave.summ_print(showRunningAve=showRunningAve, fmt_Q=fmt_Q)
        policy.summ_print(environment=environment, verbosity=0, show_env_states=False)

        try:  # sims may not have a layout_print
            environment.layout_print(vname='reward', fmt=fmt_R,
                                     show_env_states=False, none_str='*')
        except:
            pass

    if save_pickle_file:
        policy.save_to_pickle_file(save_pickle_file)
        action_value_ave.save_to_pickle_file(save_pickle_file)

    return policy, action_value_ave
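
# ---------------------------------------------------------------------------
# A minimal usage sketch for mc_exploring_starts, assuming the simple
# gridworld environment used elsewhere in this package
# (introrl.mdp_data.simple_grid_world).  The keyword values shown simply spell
# out the defaults; this block is illustrative and not part of the original
# module.
# ---------------------------------------------------------------------------
if __name__ == "__main__":  # pragma: no cover

    from introrl.mdp_data.simple_grid_world import get_gridworld

    gridworld = get_gridworld()

    # returns the greedy Policy and the running-average Q(s,a) collection
    policy, action_value_ave = mc_exploring_starts(gridworld,
                                                   initial_policy='default',
                                                   first_visit=True,
                                                   max_num_episodes=1000,
                                                   min_num_episodes=10,
                                                   max_abserr=0.001,
                                                   gamma=0.9)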


def dp_value_iteration(environment, allow_multi_actions=False,
                       do_summ_print=True, fmt_V='%g', fmt_R='%g',
                       max_iter=1000, err_delta=0.001,
                       gamma=0.9, iteration_prints=0):
    """
    ... GIVEN AN ENVIRONMENT ...
    apply Value Iteration to find the OPTIMAL POLICY

    Returns: policy and state_value objects

    Terminates when delta < err_delta * VI_STOP_CRITERIA

    CREATES BOTH policy AND state_value OBJECTS.

    If allow_multi_actions is True, the policy will include all actions
    within err_delta of the best action.
    """

    # create Policy and StateValues objects
    policy = Policy(environment=environment)
    policy.intialize_policy_to_random(env=environment)

    state_value = StateValues(environment)
    state_value.init_Vs_to_zero()  # Terminal states need to be 0.0
    #state_value.summ_print()

    # set counter and flag
    loop_counter = 0
    all_done = False

    # value-iteration stopping criteria
    # if gamma==1.0, value iteration will never stop, SO limit to the gamma==0.999 stop criteria
    # (VI terminates if delta < err_delta * VI_STOP_CRITERIA)
    # (typically err_delta = 0.001)
    VI_STOP_CRITERIA = max((1.0 - gamma) / gamma, (1.0 - 0.999) / 0.999)
    error_limit = err_delta * VI_STOP_CRITERIA

    while (loop_counter < max_iter) and (not all_done):
        loop_counter += 1
        all_done = True
        delta = 0.0  # used to calc largest change in state_value

        for s_hash in policy.iter_all_policy_states():

            # will hold: index=a_desc, value=V(s) for all transitions of a_desc from s_hash
            VsD = {}

            # MUST include currently zero prob actions
            for a_desc, a_prob in policy.iter_policy_ap_for_state(s_hash, incl_zero_prob=True):

                calcd_v = 0.0
                for sn_hash, t_prob, reward in \
                        environment.iter_next_state_prob_reward(s_hash, a_desc, incl_zero_prob=False):
                    calcd_v += t_prob * (reward + gamma * state_value(sn_hash))

                VsD[a_desc] = calcd_v

            best_a_desc, best_a_val = argmax_vmax_dict(VsD)
            delta = max(delta, abs(best_a_val - state_value(s_hash)))
            state_value[s_hash] = best_a_val

        if delta > error_limit:
            all_done = False

        if iteration_prints and (loop_counter % iteration_prints == 0):
            print('Loop:%6i' % loop_counter, ' delta=%g' % delta)

    # Now that State-Values have been determined, set policy
    for s_hash in policy.iter_all_policy_states():

        # will hold: index=a_desc, value=V(s) for all transitions of a_desc from s_hash
        VsD = {}

        # MUST include zero prob actions
        for a_desc, a_prob in policy.iter_policy_ap_for_state(s_hash, incl_zero_prob=True):

            calcd_v = 0.0
            for sn_hash, t_prob, reward in \
                    environment.iter_next_state_prob_reward(s_hash, a_desc, incl_zero_prob=False):
                calcd_v += t_prob * (reward + gamma * state_value(sn_hash))

            VsD[a_desc] = calcd_v

        if allow_multi_actions:
            best_a_list, best_a_val = multi_argmax_vmax_dict(VsD, err_delta=err_delta)
            policy.set_sole_action(s_hash, best_a_list[0])  # zero all other actions

            prob = 1.0 / len(best_a_list)
            for a_desc in best_a_list:
                policy.set_action_prob(s_hash, a_desc, prob=prob)
        else:
            best_a_desc, best_a_val = argmax_vmax_dict(VsD)
            policy.set_sole_action(s_hash, best_a_desc)

    if do_summ_print:
        s = ''
        if loop_counter >= max_iter:
            s = ' (NOTE: STOPPED ON MAX-ITERATIONS)'

        print('Exited Value Iteration', s)
        print(' iterations =', loop_counter, ' (limit=%i)' % max_iter)
        print(' measured delta =', delta)
        print(' gamma =', gamma)
        print(' err_delta =', err_delta)
        print(' error limit =', error_limit)
        print(' STOP CRITERIA =', VI_STOP_CRITERIA)

        state_value.summ_print(fmt_V=fmt_V)
        policy.summ_print(environment=environment, verbosity=0, show_env_states=False)
        environment.layout_print(vname='reward', fmt=fmt_R,
                                 show_env_states=False, none_str='*')

    return policy, state_value
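
# ---------------------------------------------------------------------------
# A minimal usage sketch for dp_value_iteration, again assuming the simple
# gridworld environment from introrl.mdp_data.simple_grid_world.  The keyword
# values spell out the defaults; this block is illustrative and not part of
# the original module.
# ---------------------------------------------------------------------------
if __name__ == "__main__":  # pragma: no cover

    from introrl.mdp_data.simple_grid_world import get_gridworld

    gridworld = get_gridworld()

    # returns the optimal Policy and the converged StateValues object
    policy, state_value = dp_value_iteration(gridworld,
                                             allow_multi_actions=False,
                                             do_summ_print=True,
                                             max_iter=1000,
                                             err_delta=0.001,
                                             gamma=0.9)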