def get_policy(self):
    policy = Policy(environment=self.environment)
    for s_hash in self.environment.iter_all_action_states():
        a_desc = self.get_best_greedy_action(s_hash)
        policy.set_sole_action(s_hash, a_desc)
    return policy
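# --- hedged usage sketch (not part of the original source) ------------------
# Assuming get_policy is a method of an action-value collection such as
# ActionValueColl, a greedy policy might be extracted like this; the names
# below are illustrative only.
#
#     avc = ActionValueColl(environment)
#     ... learn the Q(s,a) values ...
#     greedy_policy = avc.get_policy()   # deterministic greedy policy
#     greedy_policy.summ_print(environment=environment, verbosity=0)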
def qlearning_epsilon_greedy(environment,
                             learn_tracker=None,          # track progress of learning
                             initial_Qsa=0.0,             # init non-terminal_set of Q(s,a) (terminal_set=0.0)
                             initial_action_value_coll=None,  # if input, use it.
                             read_pickle_file='',
                             save_pickle_file='',
                             use_list_of_start_states=False,  # use list OR single start state of environment.
                             do_summ_print=True,
                             show_last_change=True,
                             fmt_Q='%g', fmt_R='%g',
                             pcent_progress_print=10,
                             show_banner=True,
                             max_num_episodes=sys.maxsize,
                             min_num_episodes=10,
                             max_abserr=0.001,
                             gamma=0.9,
                             iteration_prints=0,
                             max_episode_steps=sys.maxsize,
                             epsilon=0.1,
                             const_epsilon=True,
                             epsilon_half_life=200,
                             alpha=0.1,
                             const_alpha=True,
                             alpha_half_life=200,
                             N_episodes_wo_decay=0):
    """
    ... GIVEN AN ENVIRONMENT ...
    apply Q-Learning Temporal Difference to find the OPTIMAL POLICY and STATE VALUES

    Returns: Policy and ActionValueColl objects

    Uses the Q-Learning TD(0) update to find Q(s,a), the Action-Value Function

    Terminates when abserr < max_abserr

    Assume environment attached to policy will have method "get_any_action_state_hash"
    in order to begin at any action state.

    CREATES BOTH policy AND action_value_coll OBJECTS.
    """

    # create EpsilonGreedy, Alpha and ActionValueColl objects
    eg = EpsilonGreedy(epsilon=epsilon,
                       const_epsilon=const_epsilon,
                       half_life=epsilon_half_life,
                       N_episodes_wo_decay=N_episodes_wo_decay)

    alpha_obj = Alpha(alpha=alpha,
                      const_alpha=const_alpha,
                      half_life=alpha_half_life)

    if initial_action_value_coll is None:
        action_value_coll = ActionValueColl(environment, init_val=initial_Qsa)
    else:
        action_value_coll = initial_action_value_coll
    #action_value_coll.summ_print()

    num_s_hash = len(environment.get_all_action_state_hashes())

    if read_pickle_file:
        action_value_coll.init_from_pickle_file(read_pickle_file)

    if do_summ_print:
        print('================== EPSILON GREEDY DEFINED AS ========================')
        eg.summ_print()

        print('================== LEARNING RATE DEFINED AS ========================')
        alpha_obj.summ_print()

    if show_banner:
        s = 'Starting a Maximum of %i Q-Learning Epsilon Greedy Episodes'%max_num_episodes +\
            '\nfor "%s" with Gamma = %g, Alpha = %g'%( environment.name, gamma, alpha_obj() )
        banner(s, banner_char='', leftMargin=0, just='center')

    # Iterate over a list of known possible start states
    if use_list_of_start_states:
        loop_stateL = environment.limited_start_state_list()
    else:
        loop_stateL = [environment.start_state_hash]

    if show_banner:
        print('======================= Iterating over Start States ==================================')
        print(loop_stateL)
        print('======================================================================================')

    # set counter and flag
    episode_loop_counter = 0
    keep_looping = True
    progress_str = ''

    while (episode_loop_counter <= max_num_episodes - 1) and keep_looping:
        keep_looping = False
        abserr = 0.0  # calculated below as part of termination criteria
        Nterminal_episodes = set()  # tracks if ended at terminal_set or max_num_episodes

        for start_hash in loop_stateL:
            episode_loop_counter += 1
            if episode_loop_counter > max_num_episodes:
                break

            if learn_tracker is not None:
                learn_tracker.add_new_episode()

            s_hash = start_hash

            for n_episode_steps in range(max_episode_steps):
                a_desc = action_value_coll.get_best_eps_greedy_action(s_hash, epsgreedy_obj=eg)

                # Begin an episode
                if a_desc is None:
                    Nterminal_episodes.add(start_hash)
                    print('break for a_desc==None')
                    break
                else:
                    sn_hash, reward = environment.get_action_snext_reward(s_hash, a_desc)

                    if learn_tracker is not None:
                        learn_tracker.add_sarsn_to_current_episode(s_hash, a_desc, reward, sn_hash)

                    if sn_hash is None:
                        Nterminal_episodes.add(start_hash)
                        print('break for sn_hash==None, #steps=', n_episode_steps,
                              ' s_hash=%s' % str(s_hash), ' a_desc=%s' % str(a_desc))
                        break
                    else:
                        action_value_coll.qlearning_update(s_hash=s_hash,
                                                           a_desc=a_desc,
                                                           sn_hash=sn_hash,
                                                           alpha=alpha_obj(),
                                                           gamma=gamma,
                                                           reward=reward)
                        if sn_hash in environment.terminal_set:
                            Nterminal_episodes.add(start_hash)
                            if (n_episode_steps == 0) and (num_s_hash > 2):
                                print('1st step break for sn_hash in terminal_set', sn_hash,
                                      ' s_hash=%s' % str(s_hash), ' a_desc=%s' % str(a_desc))
                            break
                        s_hash = sn_hash

        # increment episode counter on EpsilonGreedy and Alpha objects
        eg.inc_N_episodes()
        alpha_obj.inc_N_episodes()

        abserr = action_value_coll.get_biggest_action_state_err()
        if abserr > max_abserr:
            keep_looping = True

        if episode_loop_counter < min_num_episodes:
            keep_looping = True  # must loop for min_num_episodes at least

        pc_done = 100.0 * float(episode_loop_counter) / float(max_num_episodes)
        if pcent_progress_print > 0:
            out_str = '%3i%%' % (pcent_progress_print * (int(pc_done / float(pcent_progress_print))))
        else:
            out_str = progress_str

        if out_str != progress_str:
            print(out_str, end=' ')
            print('Nterminal episodes =', len(Nterminal_episodes), ' of ', len(loop_stateL))
            progress_str = out_str

    policy = Policy(environment=environment)
    for s_hash in environment.iter_all_action_states():
        a_desc = action_value_coll.get_best_eps_greedy_action(s_hash, epsgreedy_obj=None)
        policy.set_sole_action(s_hash, a_desc)

    if do_summ_print:
        s = ''
        if episode_loop_counter >= max_num_episodes:
            s = '   (NOTE: STOPPED ON MAX-ITERATIONS)'

        print('Exited Epsilon Greedy, TD(0) Value Iteration', s)
        print('   # episodes      =', episode_loop_counter,
              ' (min limit=%i)' % min_num_episodes,
              ' (max limit=%i)' % max_num_episodes)
        print('   gamma           =', gamma)
        print('   estimated err   =', abserr)
        print('   Error limit     =', max_abserr)
        print('Nterminal episodes =', len(Nterminal_episodes), ' of ', len(loop_stateL))

        action_value_coll.summ_print(show_last_change=show_last_change, fmt_Q=fmt_Q)
        policy.summ_print(environment=environment, verbosity=0, show_env_states=False)

        try:  # sims may not have a layout_print
            environment.layout_print(vname='reward', fmt=fmt_R,
                                     show_env_states=False, none_str='*')
        except:
            pass

        print('================== EPSILON GREEDY DEFINED AS ========================')
        eg.summ_print()

        print('================== LEARNING RATE DEFINED AS ========================')
        alpha_obj.summ_print()

    if save_pickle_file:
        policy.save_to_pickle_file(save_pickle_file)
        action_value_coll.save_to_pickle_file(save_pickle_file)

    return policy, action_value_coll  #, steps_per_episodeL, reward_sum_per_episodeL
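# --- hedged usage sketch (not part of the original source) ------------------
# A minimal call of qlearning_epsilon_greedy, assuming the simple gridworld
# environment from introrl.mdp_data.simple_grid_world (used elsewhere in this
# repo) is compatible; the parameter values are illustrative only.
if __name__ == "__main__":  # pragma: no cover
    from introrl.mdp_data.simple_grid_world import get_gridworld

    gridworld = get_gridworld()

    # learn a greedy policy and Q(s,a) estimates for the gridworld
    policy, action_value_coll = qlearning_epsilon_greedy(gridworld,
                                                         max_num_episodes=2000,
                                                         min_num_episodes=10,
                                                         gamma=0.9,
                                                         epsilon=0.1,
                                                         alpha=0.1,
                                                         do_summ_print=True)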
    print('_____________ Value Iteration ________________')
else:
    print('_____________ Policy Iteration ________________')

for gamma in (0.001, 0.01, 0.1, 0.25, 0.5, 0.75, 0.9, 0.95, 0.99, 0.999):
    if do_VI:
        policy, sv = dp_value_iteration(robot, do_summ_print=False,
                                        fmt_V='%.1f', max_iter=1000,
                                        err_delta=0.001, gamma=gamma)
    else:
        policy = Policy(environment=robot)
        policy.set_policy_from_piD(robot.get_default_policy_desc_dict())

        sv = StateValues(robot)
        sv.init_Vs_to_zero()

        dp_policy_iteration(policy, sv, do_summ_print=False,
                            max_iter=1000, err_delta=0.001, gamma=gamma)

    print('gamma=%5g' % gamma,
          ' Fallen=', policy.get_single_action('Fallen'),
          ' Moving=', policy.get_single_action('Moving'),
          ' Standing=', policy.get_single_action('Standing'),
          ' Fallen=',
            if show_last_change:
                print('  Last Delta = %s' % self.last_delta_VsD.get(s_hash, None))
            else:
                print()


if __name__ == "__main__":  # pragma: no cover

    from introrl.policy import Policy
    from introrl.mdp_data.simple_grid_world import get_gridworld

    gridworld = get_gridworld()
    policyD = gridworld.get_default_policy_desc_dict()

    pi = Policy(environment=gridworld)
    #pi.learn_all_states_and_actions_from_env( gridworld )
    pi.set_policy_from_piD(policyD)

    # -------------
    sv = StateValueColl(gridworld)

    for _ in range(10):
        sv.mc_update((0, 0), 0.2, 2.0)
    sv.mc_update((0, 0), 0.2, 3.0)
    sv.mc_update((0, 1), 0.5, 1.0)

    print('Value at (0,0) is:', sv.get_Vs((0, 0)))
    print('get_biggest_action_state_err = ', sv.get_biggest_action_state_err(), '%')
from introrl.mc_funcs.mc_fv_prediction import mc_first_visit_prediction
from introrl.black_box_sims.blackjack_sim import BlackJackSimulation
from introrl.policy import Policy
from introrl.agent_supt.state_value_run_ave_coll import StateValueRunAveColl

BJ = BlackJackSimulation()

pi = Policy(environment=BJ)
# default policy is hit on everything except 20 & 21.
pi.set_policy_from_piD(BJ.get_default_policy_desc_dict())

sv = StateValueRunAveColl(BJ)

if 1:
    mc_first_visit_prediction(pi, sv, max_num_episodes=10000,
                              max_abserr=0.001, gamma=1.0)
    sv.save_to_pickle_file(fname='mc_blackjack_10000_eval')
else:
    mc_first_visit_prediction(pi, sv, max_num_episodes=500000,
                              max_abserr=0.001, gamma=1.0)
    sv.save_to_pickle_file(fname='mc_blackjack_500000_eval')
from introrl.policy import Policy
from introrl.black_box_sims.racetrack_1_sim import RaceTrack_1

RT = RaceTrack_1()

sca = Policy(environment=RT)
sca.add_state_action((25, 7, 0, 1))
sca.set_action_prob((25, 7, 0, 1), (1, 1), prob=1.0)
#sca.summ_print()

SA = sca.get_SA_object((25, 7, 0, 1))
print(SA)
SA.summ_print()

print('-' * 55)
sca.set_policy_from_piD(RT.get_default_policy_desc_dict())

SA = sca.get_SA_object((25, 7, 0, 1))
print(SA)
SA.summ_print()
import matplotlib
import matplotlib.pyplot as plt
from introrl.mc_funcs.mc_ev_prediction import mc_every_visit_prediction
from introrl.policy import Policy
from introrl.agent_supt.state_value_coll import StateValueColl
from introrl.mdp_data.random_walk_mrp import get_random_walk

rw_mrp = get_random_walk()
policy = Policy( environment=rw_mrp )

fig, ax = plt.subplots()

true_valueD = {'A':1.0/6.0, 'B':2.0/6.0, 'C':3.0/6.0, 'D':4.0/6.0, 'E':5.0/6.0}

for alpha in [0.01, 0.02, 0.03, 0.04, 0.05, 0.1, 0.15]:
    resultLL = []  # a list of result lists

    for loop in range(100):  # average rms curves over 100 runs
        sv = StateValueColl( rw_mrp, init_val=0.5 )

        resultL, value_snapD = mc_every_visit_prediction(policy, sv,
                                                         all_start_states=False,
                                                         do_summ_print=False,
                                                         show_last_change=False,
                                                         show_banner=False,
                                                         max_episode_steps=1000,
                                                         alpha=alpha,
                                                         const_alpha=True,
                                                         alpha_half_life=200,
                                                         max_num_episodes=100,
                                                         min_num_episodes=100,
                                                         max_abserr=0.001,
                                                         gamma=1.0,
                                                         result_list='rms',
                                                         true_valueD=true_valueD)
        resultLL.append( resultL )
        #print( 'sv.calc_rms_error(true_valueD) =', sv.calc_rms_error(true_valueD) )
from introrl.black_box_sims.random_walk_1000 import RandomWalk_1000Simulation
from introrl.agent_supt.episode_maker import make_episode
from introrl.policy import Policy

NUM_EPISODES = 100000

countD = {}  # index=state, value=count

RW = RandomWalk_1000Simulation()

policy = Policy(environment=RW)
policy.intialize_policy_to_equiprobable( env=RW )

for Nepi in range(NUM_EPISODES):
    episode = make_episode(500, policy, RW, max_steps=10000)
    for dr in episode.get_rev_discounted_returns( gamma=1.0 ):
        (s_hash, a_desc, reward, sn_hash, G) = dr
        countD[ s_hash ] = countD.get( s_hash, 0 ) + 1

SUM_VISITS = sum( list(countD.values()) )

freqL = []
for i in range(1, 1001):
    freqL.append( countD.get(i, 0) / float(SUM_VISITS) )

# copy and paste list into plot script
print('freqL =', repr(freqL))
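# --- hedged plotting sketch (not part of the original script) ---------------
# One way the freqL data printed above might be visualized in the separate
# plot script the comment refers to; matplotlib is assumed to be available,
# as it is in the other plotting scripts in this repo.
import matplotlib.pyplot as plt

fig, ax = plt.subplots()
ax.plot(range(1, 1001), freqL)   # visit frequency for states 1..1000
ax.set_xlabel('State Number')
ax.set_ylabel('Fraction of Visits')
plt.show()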
from introrl.dp_funcs.dp_policy_iter import dp_policy_iteration
from introrl.policy import Policy
from introrl.state_values import StateValues
from introrl.mdp_data.car_rental_const_rtn import get_env
from introrl.utils import pickle_esp

env = get_env()

policy = Policy(environment=env)
policy.intialize_policy_to_random(env=env)

state_value = StateValues(env)
state_value.init_Vs_to_zero()

dp_policy_iteration(policy, state_value,
                    do_summ_print=True, show_start_policy=True,
                    max_iter=1000, err_delta=0.0001, gamma=0.9)

pickle_esp.save_to_pickle_file(fname='dp_car_rental_PI_const_rtn',
                               env=env, state_values=state_value, policy=policy)

state_value.summ_print(fmt_V='%.1f')

policy.save_diagram(env, inp_colorD={
if 1:
    pi, av = mc_epsilon_greedy(RT,
                               initial_policy='default',
                               first_visit=True,
                               do_summ_print=False,
                               showRunningAve=False,
                               fmt_Q='%g', fmt_R='%g',
                               show_initial_policy=False,
                               max_num_episodes=1000,
                               min_num_episodes=10,
                               max_abserr=0.001,
                               gamma=0.9,
                               iteration_prints=0,
                               max_episode_steps=10000,
                               epsilon=0.1,
                               const_epsilon=True,
                               half_life=500,
                               N_episodes_wo_decay=0)

    pi.save_to_pickle_file('racetrack_2_sim')
else:
    pi = Policy(environment=RT)
    pi.init_from_pickle_file('racetrack_2_sim')

fig, ax = plt.subplots()

RT.plot_policy(ax, pi)

plt.show()
fig.savefig("racetrack_2_sim.png")
from introrl.dp_funcs.dp_policy_eval import dp_policy_evaluation
from introrl.policy import Policy
from introrl.state_values import StateValues
from introrl.mdp_data.sutton_ex4_1_grid import get_gridworld

gridworld = get_gridworld()

pi = Policy(environment=gridworld)
pi.intialize_policy_to_equiprobable(env=gridworld)

sv = StateValues(gridworld)
sv.init_Vs_to_zero()

dp_policy_evaluation(pi, sv, max_iter=1000, err_delta=0.001,
                     gamma=1., fmt_V='%.1f')

#sv.summ_print( fmt_V='%.3f', show_states=False )
pi.summ_print(environment=gridworld, verbosity=0, show_env_states=False)
#print( gridworld.get_info() )
def mc_exploring_starts(environment,
                        initial_policy='default',
                        read_pickle_file='',
                        save_pickle_file='',
                        first_visit=True,
                        do_summ_print=True,
                        showRunningAve=False,
                        fmt_Q='%g', fmt_R='%g',
                        show_initial_policy=True,
                        max_num_episodes=1000,
                        min_num_episodes=10,
                        max_abserr=0.001,
                        gamma=0.9,
                        max_episode_steps=10000,
                        iteration_prints=0):
    """
    ... GIVEN AN ENVIRONMENT ...
    apply Monte Carlo Exploring Starts to find the OPTIMAL POLICY

    initial_policy can be 'default', 'random', policy_dictionary, Policy object

    Returns: Policy and ActionValueRunAveColl objects

    Use Episode Discounted Returns to find Q(s,a), Action-Value Function

    Terminates when abserr < max_abserr

    Assume that Q(s,a), action_value_ave, has been initialized prior to call.

    Assume environment attached to policy will have method "get_any_action_state_hash"
    in order to begin at any action state.

    CREATES BOTH policy AND action_value OBJECTS.
    """

    # create Policy and ActionValueRunAveColl objects
    policy = Policy(environment=environment)

    if initial_policy == 'default':
        print('Initializing Policy to "default" in mc_exploring_starts')
        policy.learn_a_legal_action_from_env(env=environment)
        policy.set_policy_from_piD(environment.get_default_policy_desc_dict())
    elif initial_policy == 'random':
        print('Initializing Policy to "random" in mc_exploring_starts')
        policy.intialize_policy_to_random(env=environment)
    elif isinstance(initial_policy, Policy):
        policy = initial_policy
    else:
        print('Initializing Policy to "custom policy" in mc_exploring_starts')
        policy.learn_a_legal_action_from_env(env=environment)
        policy.set_policy_from_piD(initial_policy)

    action_value_ave = ActionValueRunAveColl(environment)
    action_value_ave.init_Qsa_to_zero()  # Terminal states w/o an action are NOT included
    #action_value_ave.summ_print()

    if read_pickle_file:
        policy.init_from_pickle_file(read_pickle_file)
        action_value_ave.init_from_pickle_file(read_pickle_file)

    if do_summ_print:
        if show_initial_policy:
            print('=============== STARTING WITH THE INITIAL POLICY ====================')
            policy.summ_print(verbosity=0, environment=environment,
                              show_env_states=False, none_str='*')

        s = 'Starting a Maximum of %i Monte Carlo Exploring Start Episodes\nfor "%s" with Gamma = %g'%\
            (max_num_episodes, environment.name, gamma)
        banner(s, banner_char='', leftMargin=0, just='center')

    # create an Episode object for getting returns
    episode = Episode(environment.name + ' Episode')

    # set counter and flag
    num_episodes = 0
    keep_looping = True
    progress_str = ''

    while (num_episodes <= max_num_episodes - 1) and keep_looping:
        keep_looping = False
        abserr = 0.0  # calculated below as part of termination criteria

        for start_hash in environment.iter_all_action_states(randomize=True):
            a_descL = environment.get_state_legal_action_list(start_hash)
            # randomize action order
            random.shuffle(a_descL)

            # try every initial action for each start_hash
            for a_desc in a_descL:
                # break from inner loop if max_num_episodes is hit.
                if num_episodes >= max_num_episodes:
                    break

                make_episode(start_hash, policy, environment,
                             environment.terminal_set,
                             episode=episode,
                             first_a_desc=a_desc,
                             max_steps=max_episode_steps,
                             eps_greedy=None)
                num_episodes += 1

                for dr in episode.get_rev_discounted_returns(gamma=gamma,
                                                             first_visit=first_visit,
                                                             visit_type='SA'):
                    # look at each step from episode and calc average Q(s,a)
                    (s, a, r, sn, G) = dr
                    action_value_ave.add_val(s, a, G)

                    aL = environment.get_state_legal_action_list(s)
                    if aL:
                        best_a_desc, best_a_val = aL[0], float('-inf')
                        bestL = [best_a_desc]
                        for a in aL:
                            q = action_value_ave.get_ave(s, a)
                            if q > best_a_val:
                                best_a_desc, best_a_val = a, q
                                bestL = [a]
                            elif q == best_a_val:
                                bestL.append(a)
                        best_a_desc = random.choice(bestL)
                        policy.set_sole_action(s, best_a_desc)

        abserr = action_value_ave.get_biggest_action_state_err()
        if abserr > max_abserr:
            keep_looping = True

        if num_episodes < min_num_episodes:
            keep_looping = True  # must loop for min_num_episodes at least

        pc_done = 100.0 * float(num_episodes) / float(max_num_episodes)
        out_str = '%3i%%' % (5 * (int(pc_done / 5.0)))
        if out_str != progress_str:
            score = environment.get_policy_score(policy=policy,
                                                 start_state_hash=None,
                                                 step_limit=1000)
            print(out_str, ' score=%s' % str(score),
                  ' = (r_sum, n_steps, msg)',
                  ' estimated err =', abserr)
            progress_str = out_str

    if do_summ_print:
        s = ''
        if num_episodes >= max_num_episodes:
            s = '   (NOTE: STOPPED ON MAX-ITERATIONS)'

        print('Exited MC First-Visit Value Iteration', s)
        print('   num episodes  =', num_episodes,
              ' (min limit=%i)' % min_num_episodes,
              ' (max limit=%i)' % max_num_episodes)
        print('   gamma         =', gamma)
        print('   estimated err =', abserr)
        print('   Error limit   =', max_abserr)

        action_value_ave.summ_print(showRunningAve=showRunningAve, fmt_Q=fmt_Q)
        policy.summ_print(environment=environment, verbosity=0, show_env_states=False)

        try:  # sims may not have a layout_print
            environment.layout_print(vname='reward', fmt=fmt_R,
                                     show_env_states=False, none_str='*')
        except:
            pass

    if save_pickle_file:
        policy.save_to_pickle_file(save_pickle_file)
        action_value_ave.save_to_pickle_file(save_pickle_file)

    return policy, action_value_ave
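# --- hedged usage sketch (not part of the original source) ------------------
# A minimal call of mc_exploring_starts, assuming the simple gridworld
# environment from introrl.mdp_data.simple_grid_world (used elsewhere in this
# repo) is compatible; the parameter values are illustrative only.
if __name__ == "__main__":  # pragma: no cover
    from introrl.mdp_data.simple_grid_world import get_gridworld

    gridworld = get_gridworld()

    # learn a greedy policy and running-average Q(s,a) estimates
    policy, action_value_ave = mc_exploring_starts(gridworld,
                                                   initial_policy='default',
                                                   first_visit=True,
                                                   max_num_episodes=1000,
                                                   min_num_episodes=10,
                                                   max_abserr=0.001,
                                                   gamma=0.9)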
        print('   --> Final Policy AFTER POLICY ITERATION <--')
        policy.summ_print(environment=state_value.environment, verbosity=0,
                          show_env_states=False)


if __name__ == "__main__":  # pragma: no cover

    import sys
    from introrl.policy import Policy
    from introrl.state_values import StateValues
    from introrl.dp_funcs.dp_policy_eval import dp_policy_evaluation
    from introrl.mdp_data.simple_grid_world import get_gridworld

    gridworld = get_gridworld()

    pi = Policy(environment=gridworld)
    #pi.intialize_policy_to_equiprobable(env=gridworld)
    pi.intialize_policy_to_random(env=gridworld)
    #pi.learn_all_states_and_actions_from_env( gridworld )
    #pi.set_policy_from_piD( gridworld.get_default_policy_desc_dict() )

    # change one action from gridworld default
    pi.set_sole_action((1, 0), 'D')  # is 'U' in default

    sv = StateValues(gridworld)
    sv.init_Vs_to_zero()

    dp_policy_iteration(pi, sv,
def dp_value_iteration(environment,
                       allow_multi_actions=False,
                       do_summ_print=True,
                       fmt_V='%g', fmt_R='%g',
                       max_iter=1000,
                       err_delta=0.001,
                       gamma=0.9,
                       iteration_prints=0):
    """
    ... GIVEN AN ENVIRONMENT ...
    apply Value Iteration to find the OPTIMAL POLICY

    Returns: policy and state_value objects

    Terminates when delta < err_delta * VI_STOP_CRITERIA

    CREATES BOTH policy AND state_value OBJECTS.

    If allow_multi_actions is True, policy will include all actions
    within err_delta of best action.
    """

    # create Policy and StateValues objects
    policy = Policy(environment=environment)
    policy.intialize_policy_to_random(env=environment)

    state_value = StateValues(environment)
    state_value.init_Vs_to_zero()  # Terminal states need to be 0.0
    #state_value.summ_print()

    # set counter and flag
    loop_counter = 0
    all_done = False

    # value-iteration stopping criteria
    # if gamma==1.0 value iteration will never stop SO limit to gamma==0.999 stop criteria
    # (VI terminates if delta < err_delta * VI_STOP_CRITERIA)
    # (typically err_delta = 0.001)
    VI_STOP_CRITERIA = max((1.0 - gamma) / gamma, (1.0 - 0.999) / 0.999)
    error_limit = err_delta * VI_STOP_CRITERIA

    while (loop_counter < max_iter) and (not all_done):
        loop_counter += 1
        all_done = True
        delta = 0.0  # used to calc largest change in state_value

        for s_hash in policy.iter_all_policy_states():
            # will hold: index=a_desc, value=V(s) for all transitions of a_desc from s_hash
            VsD = {}

            # MUST include currently zero prob actions
            for a_desc, a_prob in policy.iter_policy_ap_for_state(s_hash, incl_zero_prob=True):
                calcd_v = 0.0
                for sn_hash, t_prob, reward in \
                        environment.iter_next_state_prob_reward(s_hash, a_desc, incl_zero_prob=False):
                    calcd_v += t_prob * (reward + gamma * state_value(sn_hash))

                VsD[a_desc] = calcd_v

            best_a_desc, best_a_val = argmax_vmax_dict(VsD)
            delta = max(delta, abs(best_a_val - state_value(s_hash)))
            state_value[s_hash] = best_a_val

        if delta > error_limit:
            all_done = False

        if iteration_prints and (loop_counter % iteration_prints == 0):
            print('Loop:%6i' % loop_counter, '  delta=%g' % delta)

    # Now that State-Values have been determined, set policy
    for s_hash in policy.iter_all_policy_states():
        # will hold: index=a_desc, value=V(s) for all transitions of a_desc from s_hash
        VsD = {}

        # MUST include zero prob actions
        for a_desc, a_prob in policy.iter_policy_ap_for_state(s_hash, incl_zero_prob=True):
            calcd_v = 0.0
            for sn_hash, t_prob, reward in \
                    environment.iter_next_state_prob_reward(s_hash, a_desc, incl_zero_prob=False):
                calcd_v += t_prob * (reward + gamma * state_value(sn_hash))

            VsD[a_desc] = calcd_v

        if allow_multi_actions:
            best_a_list, best_a_val = multi_argmax_vmax_dict(VsD, err_delta=err_delta)
            policy.set_sole_action(s_hash, best_a_list[0])  # zero all other actions

            prob = 1.0 / len(best_a_list)
            for a_desc in best_a_list:
                policy.set_action_prob(s_hash, a_desc, prob=prob)
        else:
            best_a_desc, best_a_val = argmax_vmax_dict(VsD)
            policy.set_sole_action(s_hash, best_a_desc)

    if do_summ_print:
        s = ''
        if loop_counter >= max_iter:
            s = '   (NOTE: STOPPED ON MAX-ITERATIONS)'

        print('Exited Value Iteration', s)
        print('   iterations     =', loop_counter, ' (limit=%i)' % max_iter)
        print('   measured delta =', delta)
        print('   gamma          =', gamma)
        print('   err_delta      =', err_delta)
        print('   error limit    =', error_limit)
        print('   STOP CRITERIA  =', VI_STOP_CRITERIA)

        state_value.summ_print(fmt_V=fmt_V)
        policy.summ_print(environment=environment, verbosity=0, show_env_states=False)

        environment.layout_print(vname='reward', fmt=fmt_R,
                                 show_env_states=False, none_str='*')

    return policy, state_value
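# --- worked example of the stopping criteria above (illustrative only) ------
# For the defaults gamma=0.9 and err_delta=0.001:
#     VI_STOP_CRITERIA = max((1.0 - 0.9)/0.9, (1.0 - 0.999)/0.999) ~= 0.1111
#     error_limit      = 0.001 * 0.1111                            ~= 1.11e-4
# so the sweep loop exits once delta, the largest state-value change in a
# sweep, drops below roughly 1.11e-4 (or max_iter is reached).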
from introrl.dp_funcs.dp_policy_iter import dp_policy_iteration
from introrl.policy import Policy
from introrl.state_values import StateValues
from introrl.mdp_data.car_rental import get_env
from introrl.utils import pickle_esp

env = get_env()

policy = Policy(environment=env)
policy.intialize_policy_to_random(env=env)

state_value = StateValues(env)
state_value.init_Vs_to_zero()

dp_policy_iteration(policy, state_value,
                    do_summ_print=True, show_start_policy=True,
                    max_iter=1000, err_delta=0.0001, gamma=0.9)

diag_colorD = {'5': 'r', '4': 'g', '3': 'b', '2': 'c', '1': 'y', '0': 'w',
               '-5': 'r', '-4': 'g',
class MyTest(unittest.TestCase):
    def setUp(self):
        unittest.TestCase.setUp(self)
        self.gridworld = get_gridworld()
        self.P = Policy(environment=self.gridworld)
        self.P.intialize_policy_to_equiprobable(env=self.gridworld)

    def tearDown(self):
        unittest.TestCase.tearDown(self)
        del (self.P)

    def test_should_always_pass_cleanly(self):
        """Should always pass cleanly."""
        pass

    def test_myclass_existence(self):
        """Check that myclass exists"""
        # See if the self.P object exists
        self.assertIsInstance(self.P, Policy, msg=None)

    def test_set_policy_from_default_pi(self):
        """test set policy from default pi"""
        policyD = self.gridworld.get_default_policy_desc_dict()
        self.P.set_policy_from_piD(policyD)
        self.assertEqual(self.P.get_action_prob((2, 2), 'U'), 1.0)
        self.assertEqual(self.P.get_action_prob((2, 2), 'R'), 0.0)
        self.assertEqual(self.P.get_action_prob((2, 2), 'D'), None)

    #def test_set_policy_from_list_of_actions(self):
    #    """test set policy from list of actions"""
    #    piD = {(0, 0):('R','D') }
    #    self.P.set_policy_from_piD( piD )
    #    self.assertEqual(self.P.get_action_prob( (0,0), 'U'), None)
    #    self.assertEqual(self.P.get_action_prob( (0,0), 'R'), 0.5)
    #    self.assertEqual(self.P.get_action_prob( (0,0), 'D'), 0.5)

    #def test_set_policy_from_list_of_action_probs(self):
    #    """test set policy from list of action probs"""
    #    piD = {(0, 0):[('R',0.6), ('D',0.4)] }
    #    self.P.set_policy_from_piD( piD )
    #    self.assertEqual(self.P.get_action_prob( (0,0), 'U'), None)
    #    self.assertEqual(self.P.get_action_prob( (0,0), 'R'), 0.6)
    #    self.assertEqual(self.P.get_action_prob( (0,0), 'D'), 0.4)
    #
    #    # make (action, prob) entry too long.
    #    with self.assertRaises(ValueError):
    #        piD = {(0, 0):[('R',0.6,0.4), ('D',0.4,0.6)] }
    #        self.P.set_policy_from_piD( piD )

    def test_learn_all_s_and_a(self):
        """test learn all s and a"""
        self.P.learn_all_states_and_actions_from_env(self.gridworld)

    def test_initialize_to_random(self):
        """test initialize to random"""
        self.P.intialize_policy_to_random(env=self.gridworld)
        apL = self.P.get_list_of_all_action_desc_prob((0, 2), incl_zero_prob=True)
        pL = [p for (adesc, p) in apL]
        self.assertEqual(sorted(pL), [0.0, 0.0, 1.0])

    def test_iterate_adesc_p(self):
        """test iterate adesc p"""
        apL = []
        for (a_desc, p) in self.P.iter_policy_ap_for_state((0, 0), incl_zero_prob=False):
            apL.append((a_desc, p))
        self.assertIn(('R', 0.5), apL)
        self.assertIn(('D', 0.5), apL)
        self.assertNotIn(('U', 0.5), apL)

    def test_iterate_all_states(self):
        """test iterate all states"""
        sL = []
        for s_hash in self.P.iter_all_policy_states():
            sL.append(s_hash)
        sL.sort()
        self.assertEqual(len(sL), 9)
        self.assertEqual(sL[0], (0, 0))
        self.assertEqual(sL[-1], (2, 3))

    def test_get_single_action(self):
        """test get single action"""
        a_desc = self.P.get_single_action((0, 0))
        self.assertIn(a_desc, ('R', 'D'))

        a_desc = self.P.get_single_action((99, 99))
        self.assertEqual(a_desc, None)
                                 a_desc=self.A[self.tau],
                                 delta=delta)


if __name__ == "__main__":  # pragma: no cover

    from introrl.mdp_data.simple_grid_world import get_gridworld
    from introrl.policy import Policy
    from introrl.agent_supt.epsilon_calc import EpsilonGreedy
    from introrl.agent_supt.episode_maker import make_episode
    from introrl.agent_supt.action_value_coll import ActionValueColl

    gridworld = get_gridworld()
    sv = ActionValueColl(gridworld)

    pi = Policy(environment=gridworld)
    pi.set_policy_from_piD(gridworld.get_default_policy_desc_dict())
    #pi.summ_print()

    eg = EpsilonGreedy(epsilon=0.5, const_epsilon=True, half_life=200,
                       N_episodes_wo_decay=0)

    episode_obj = make_episode((2, 0), pi, gridworld, eps_greedy=None)

    """environment, Nsteps=3, policy=None, episode_obj=None,
       terminal_set=None, max_steps=sys.maxsize, eps_greedy=None"""
    def setUp(self):
        unittest.TestCase.setUp(self)
        self.gridworld = get_gridworld()
        self.P = Policy(environment=self.gridworld)
        self.P.intialize_policy_to_equiprobable(env=self.gridworld)
eps_obj.set_half_life_for_N_episodes(Nepisodes=NUM_EPISODES,
                                     epsilon_final=0.16666666666666)

agent = SA_SemiGradAgent(environment=gridworld,
                         update_type='qlearn',
                         sa_linear_function=LazyProgrammerMaze(gridworld),
                         learn_tracker=learn_tracker,
                         gamma=0.9,
                         alpha=alpha_obj,
                         epsilon=eps_obj)

for i in range(NUM_EPISODES):
    agent.run_episode((2, 0))

print()
agent.summ_print()
print('-' * 77)

#learn_tracker.summ_print()
#print('-'*77)

agent.action_value_linfunc.summ_print(fmt_Q='%.4f')
print('-' * 77)

policy = Policy(environment=gridworld)
for s_hash in gridworld.iter_all_action_states():
    a_desc = agent.action_value_linfunc.get_best_eps_greedy_action(s_hash, epsgreedy_obj=None)
    policy.set_sole_action(s_hash, a_desc)

policy.summ_print(environment=gridworld, verbosity=0)
from introrl.policy import Policy
from introrl.black_box_sims.racetrack_1_sim import RaceTrack_1

import matplotlib
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches

RT = RaceTrack_1()

#pi = Policy( environment=RT )
policyD = Policy().read_pickle_file('racetrack_1_sim')
#pi.init_from_pickle_file( 'racetrack_1_sim' )

fig, ax = plt.subplots()

for (j, i) in RT.racetrack_area:
    rect = mpatches.Rectangle((i - .5, j - .5), 1.0, 1.0,
                              ec="none", color='blue', alpha=0.3)
    ax.add_patch(rect)

for (j, i, _, _) in RT.starting_lineL:
    rect = mpatches.Rectangle((i - .5, j - .5), 1.0, 1.0,
                              ec="none", color='yellow', alpha=1.)
import matplotlib.pyplot as plt

from introrl.td_funcs.td0_prediction import td0_prediction
from introrl.utils.running_ave import RunningAve
from introrl.mc_funcs.mc_ev_prediction import mc_every_visit_prediction
from introrl.policy import Policy
from introrl.agent_supt.state_value_coll import StateValueColl
from introrl.agent_supt.nstep_td_eval_walker import NStepTDWalker
from introrl.mdp_data.random_walk_generic_mrp import get_random_walk
from introrl.agent_supt.episode_maker import make_episode

GAMMA = 1.0
AVE_OVER = 100

rw_mrp = get_random_walk(Nside_states=9, win_reward=1.0,
                         lose_reward=-1.0, step_reward=0.0)

policy = Policy( environment=rw_mrp )
policy.intialize_policy_to_equiprobable()  # should be equiprobable from above init already

episode_obj = make_episode( 'C', policy, rw_mrp )

fig, ax = plt.subplots()

# ---------------- set up true value data for RMS calc --------------------
true_valueD = {'C':0.0}  # { 'Win':0.0, 'Lose':0.0}
#print('rw_mrp.get_num_states() = ',rw_mrp.get_num_states())

delta = 2.0 / (rw_mrp.get_num_states() - 1)
Nsides = int( rw_mrp.get_num_states() / 2 ) - 1

d = 0.0
for i in range(1, Nsides+1):
from introrl.mdp_data.slippery_cleaning_robot import get_robot

gridworld = get_robot()

if 1:
    policy, state_value = dp_value_iteration(gridworld, do_summ_print=True,
                                             fmt_V='%.3f', max_iter=1000,
                                             err_delta=0.001, gamma=1.0)
    print('_' * 55)

    score = gridworld.get_policy_score(policy, start_state_hash=None, step_limit=1000)
    print('Policy Score =', score, ' = (r_sum, n_steps, msg)')
else:
    pi = Policy( environment=gridworld )
    pi.set_policy_from_piD( gridworld.get_default_policy_desc_dict() )

    sv = StateValues( gridworld )
    sv.init_Vs_to_zero()

    dp_policy_evaluation( pi, sv, max_iter=1000, err_delta=0.001, gamma=.985)

    #sv.summ_print( fmt_V='%.3f', show_states=False )
    pi.summ_print( environment=gridworld, verbosity=0, show_env_states=False )
    print( gridworld.get_info() )