def summ_print(self, long=False, time_stamp=None): # pragma: no cover
    Model.summ_print(self, long=long)

    if time_stamp is None:
        # approximate time_stamp with the largest model time_stamp
        time_stamp = 0
        for t in self.state_action_time_stampD.values():
            time_stamp = max(t, time_stamp)

    # get all states and figure out formatting
    sL = sorted( [s_hash for s_hash in self.define_statesD.keys()], key=NaturalOrStrKey )

    max_len = max(6, max([len(str(s)) for s in sL]))
    fmt = '%' + '%is'%max_len

    # get all actions for each state and figure out formatting
    astrL = [RSA.get_action_desc_str() for RSA in self.define_statesD.values()]

    max_a_len = max(6, max([len(str(a)) for a in astrL]))
    fmt_a = '%' + '%is'%max_a_len

    max_a2_len = 0
    max_det_len = 0
    for s_hash in sL:
        RSA = self.define_statesD[s_hash]
        aL = [a_desc for a_desc in RSA.action_countD.keys()]
        max_a2_len = max(max_a2_len, max([len(str(a)) for a in aL]))
        max_det_len = max(max_det_len, len(RSA.get_state_deterministic_desc().strip()))
    fmt_a2 = '%' + '%is'%max_a2_len
    fmt_det = '%-' + '%is'%max_det_len

    print('___________________________________________________')
    print('              State/Action TimeStamps              ')
    print('___________________________________________________')

    for s_hash in sL:
        RSA = self.define_statesD[s_hash]
        aL = sorted( [a_desc for a_desc in RSA.action_countD.keys()], key=NaturalOrStrKey )
        #print('aL =',aL, type(aL))

        # self.state_action_time_stampD = {} # index=(s_hash,a_desc), value=time_stamp
        tstampL = [ fmt_a2%str(a) + '=%i'%(time_stamp - self.state_action_time_stampD[(s_hash, a)],)
                    for a in aL ]

        print( fmt%str(s_hash), fmt_a%RSA.get_action_desc_str(), '...',
               fmt_det%RSA.get_state_deterministic_desc().strip(), ' Age:',
               ', '.join(tstampL) )
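# --- Illustration only (not part of the class above) ------------------------
# The "Age" column reports how stale each (state, action) pair is relative to
# the reference time_stamp.  A minimal standalone sketch of that arithmetic,
# using hypothetical stamp values:
if __name__ == "__main__": # pragma: no cover
    state_action_time_stampD = {((0, 0), 'U'): 7, ((0, 0), 'R'): 12}

    # mirrors the default above: use the largest recorded stamp as the reference
    time_stamp = max(state_action_time_stampD.values())            # 12

    age_U = time_stamp - state_action_time_stampD[((0, 0), 'U')]   # 12 - 7 = 5
    age_R = time_stamp - state_action_time_stampD[((0, 0), 'R')]   # 12 - 12 = 0
    print('Age of ((0,0),U) =', age_U, '   Age of ((0,0),R) =', age_R)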
if __name__ == "__main__": # pragma: no cover import time import os, sys from introrl.agent_supt.model import Model from introrl.environments.env_baseline import EnvBaseline from introrl.dp_funcs.dp_value_iter import dp_value_iteration from introrl.utils import pickle_esp start_time = time.time() RW = RandomWalk_1000Simulation() #RW.layout.s_hash_print( none_str='*' ) get_sim = Model( RW, build_initial_model=True ) get_sim.collect_transition_data( num_det_calls=100, num_stoic_calls=10000 ) RW.layout.s_hash_print() #get_sim.num_calls_layout_print() #get_sim.min_num_calls_layout_print() env = EnvBaseline( s_hash_rowL=RW.s_hash_rowL, x_axis_label=RW.x_axis_label, y_axis_label=RW.y_axis_label ) get_sim.add_all_data_to_an_environment( env ) policy, state_value = dp_value_iteration( env, do_summ_print=True, fmt_V='%.3f', fmt_R='%.1f',
return lim_stateL if __name__ == "__main__": # pragma: no cover import time import os, sys from introrl.dp_funcs.dp_value_iter import dp_value_iteration from introrl.environments.env_baseline import EnvBaseline from introrl.agent_supt.model import Model start_time = time.time() BJ = BlackJackSimulation() get_sim = Model(BJ, build_initial_model=True) # if there's a pickle file, read it fname = os.path.split(__file__)[-1].split('.')[ 0] # use file prefix for pickle file print('Pickle File Name Prefix:', fname) if not get_sim.read_pickle_file(fname): get_sim.collect_transition_data(num_det_calls=10, num_stoic_calls=10000) #get_sim.collect_transition_data( num_det_calls=10, num_stoic_calls=10000 ) print('Total recorded actions Before:', "{:,}".format(get_sim.total_num_action_data_points()))
            if sn_count == len(snD):
                print()

        print('____' + '_'*len(header))

if __name__ == "__main__": # pragma: no cover

    from introrl.agent_supt.model import Model
    from introrl.mdp_data.simple_grid_world import get_gridworld

    gridworld = get_gridworld()

    get_sim = Model( gridworld, build_initial_model=True )

    # ---------- make a few stochastic to test summ_print
    #get_sim.define_statesD[s_hash].save_action_results( a_desc, sn_hash, reward_val)

    # make just the reward stochastic
    get_sim.define_statesD[(0, 2)].save_action_results( 'R', (0, 3), 2.0)

    # make the action stochastic
    get_sim.define_statesD[(1, 0)].save_action_results( 'U', 'XXX', 0.0)

    # make both the action and reward stochastic
    get_sim.define_statesD[(2, 2)].save_action_results( 'U', 'XXX', 2.0)
    get_sim.define_statesD[(2, 2)].save_action_results( 'U', 'XXX', 2.2)
if __name__ == "__main__": # pragma: no cover import time import os, sys from introrl.dp_funcs.dp_value_iter import dp_value_iteration from introrl.environments.env_baseline import EnvBaseline from introrl.agent_supt.model import Model from introrl.utils import pickle_esp start_time = time.time() CR = CarRentalSimulation() get_sim = Model(CR, build_initial_model=True) get_sim.collect_transition_data(num_det_calls=50, num_stoic_calls=100000) print('Total recorded actions Before:', "{:,}".format(get_sim.total_num_action_data_points())) CR.layout.s_hash_print() get_sim.num_calls_layout_print(row_tickL=[c for c in ' First Location'], const_col_w=True, x_axis_label='Second Location', none_str='*') get_sim.min_num_calls_layout_print( row_tickL=[c for c in ' First Location'], const_col_w=True,
def __init__(self, env_interface, build_initial_model=False):
    # Interface (can be sim or env)

    # add dictionary to track time_stamp
    self.state_action_time_stampD = {} # index=(s_hash,a_desc), value=time_stamp

    Model.__init__(self, env_interface, build_initial_model=build_initial_model )
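# --- Illustration only (hypothetical stamping policy, not from this module) --
# One way the dictionary could be maintained: record the step count at which
# each (s_hash, a_desc) pair was last experienced, so summ_print() can report
# an "Age" for every pair.
if __name__ == "__main__": # pragma: no cover
    state_action_time_stampD = {}   # stand-in for self.state_action_time_stampD

    step_count = 0
    for s_hash, a_desc in [((0, 0), 'U'), ((0, 1), 'R'), ((0, 0), 'U')]:
        step_count += 1
        state_action_time_stampD[(s_hash, a_desc)] = step_count

    print(state_action_time_stampD)   # {((0, 0), 'U'): 3, ((0, 1), 'R'): 2}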
return ['U', 'D', 'R', 'L'] if __name__ == "__main__": # pragma: no cover import time import os, sys from introrl.td_funcs.qlearning_epsilon_greedy import qlearning_epsilon_greedy from introrl.td_funcs.sarsa_epsilon_greedy import sarsa_epsilon_greedy from introrl.agent_supt.model import Model bmaze = BlockingMaze() bmaze.open_gate_R() bmaze.close_gate_L() env = Model(bmaze, build_initial_model=True) env.collect_transition_data(num_det_calls=10, num_stoic_calls=1000) env.summ_print(long=False) bmaze.layout.s_hash_print(none_str='*') bmaze.open_gate_L() bmaze.close_gate_R() env.collect_transition_data(num_det_calls=10, num_stoic_calls=1000) env.summ_print(long=False) policy, action_value = \ sarsa_epsilon_greedy( bmaze, initial_Qsa=0.0, # init non-terminal_set of V(s) (terminal_set=0.0) read_pickle_file='', save_pickle_file='', use_list_of_start_states=False, # use list OR single start state of environment.
if __name__ == "__main__": # pragma: no cover import time import os, sys from introrl.dp_funcs.dp_value_iter import dp_value_iteration from introrl.environments.env_baseline import EnvBaseline #from introrl.black_boxes.collect_sim_data import CollectSimData from introrl.agent_supt.model import Model start_time = time.time() s_hash_rowL = ((0, 1, 2, 3, 4), ) CR = Simulation(s_hash_rowL=s_hash_rowL) #get_sim = CollectSimData( CR ) get_sim = Model(CR, build_initial_model=True) # if there's a pickle file, read it fname = os.path.split(__file__)[-1].split('.')[ 0] # use file prefix for pickle file print('Pickle File Name Prefix:', fname) if not get_sim.read_pickle_file(fname): get_sim.collect_transition_data(num_det_calls=10, num_stoic_calls=1000) print('Total recorded actions Before:', "{:,}".format(get_sim.total_num_action_data_points())) get_sim.collect_transition_data(num_det_calls=10, num_stoic_calls=100) print('Total recorded actions After:', "{:,}".format(get_sim.total_num_action_data_points()))
import time
from introrl.dp_funcs.dp_value_iter import dp_value_iteration
from introrl.environments.env_baseline import EnvBaseline
from introrl.agent_supt.model import Model
from introrl.utils import pickle_esp
from introrl.black_box_sims.blackjack_sim import BlackJackSimulation

start_time = time.time()

BJ = BlackJackSimulation()

get_sim = Model(BJ, build_initial_model=True)
get_sim.collect_transition_data(num_det_calls=50, num_stoic_calls=100000)

BJ.layout.s_hash_print()
get_sim.num_calls_layout_print()
get_sim.min_num_calls_layout_print()

print('got sim data')
print('_' * 55)

env = EnvBaseline(s_hash_rowL=BJ.s_hash_rowL,
                  x_axis_label=BJ.x_axis_label,
                  y_axis_label=BJ.y_axis_label)
get_sim.add_all_data_to_an_environment(env)

print('built environment')
print('_' * 55)
class DynaQAgent(object): """ DynaQ Agent. """ def __init__( self, environment, learn_tracker=None, # track progress of learning initial_Qsa=0.0, # init non-terminal_set of V(s) (terminal_set=0.0) initial_action_value_coll=None, # if input, use it. read_pickle_file='', save_pickle_file='', do_summ_print=True, show_last_change=True, pcent_progress_print=10, show_banner=True, gamma=0.9, iteration_prints=0, max_episode_steps=sys.maxsize, epsilon=0.1, # can be constant or EpsilonGreedy object alpha=0.1): # can be constant or Alpha object """ ... GIVEN AN ENVIRONMENT ... Use basic Dyna-Q algorithm to solve for STATE-ACTION VALUES, Q(s,a) Each action is forced to be a DETERMINISTIC action leading to one state and reward. (If the next state or reward changes, only the new values will be considered) attribute: self.action_value_coll is the ActionValueColl, Q(s,a) object A DETERMINISTIC policy can be created externally from the self.action_value_coll attribute. """ self.environment = environment self.learn_tracker = learn_tracker self.save_pickle_file = save_pickle_file self.do_summ_print = do_summ_print self.show_last_change = show_last_change self.pcent_progress_print = pcent_progress_print self.gamma = gamma self.iteration_prints = iteration_prints self.max_episode_steps = max_episode_steps self.num_episodes = 0 self.num_updates = 0 # if input epsilon is a float, use it to create an EpsilonGreedy object if type(epsilon) == type(0.1): self.epsilon_obj = EpsilonGreedy(epsilon=epsilon, const_epsilon=True) else: self.epsilon_obj = epsilon # if input alpha is a float, use it to create an Alpha object if type(alpha) == type(0.1): self.alpha_obj = Alpha(alpha=alpha, const_alpha=True) else: self.alpha_obj = alpha # create the action_value_coll for the environment. if initial_action_value_coll is None: self.action_value_coll = ActionValueColl(environment, init_val=initial_Qsa) else: self.action_value_coll = initial_action_value_coll if read_pickle_file: self.action_value_coll.init_from_pickle_file(read_pickle_file) # initialize the model that will build from experience # do not build full model description on Model init, states not visited # by the RL portion will have no returns values. self.model = Model(environment, build_initial_model=False) #for s_hash, aD in self.action_value_coll.QsaD.items(): # for a_desc, Q in aD.items(): # self.model.add_action( s_hash, a_desc ) if do_summ_print: print( '================== EPSILON GREEDY DEFINED AS ========================' ) self.epsilon_obj.summ_print() print( '================== LEARNING RATE DEFINED AS ========================' ) self.alpha_obj.summ_print() if show_banner: s = 'Starting a Maximum of %i Dyna-Q Epsilon Greedy Steps/Episode'%self.max_episode_steps +\ '\nfor "%s" with Gamma = %g, Alpha = %g'%( environment.name, self.gamma, self.alpha_obj() ) banner(s, banner_char='', leftMargin=0, just='center') def run_episode(self, start_state, Nplanning_loops=5, iter_sarsn=None): """ Run a single episode of Dyna-Q algorithm If iter_sarsn is input, use it instead of action_value_coll calculations. (Note: the start_state should NOT be in terminal_set if iter_sarsn is input.) 
""" # increment episode counters self.num_episodes += 1 self.epsilon_obj.inc_N_episodes() self.alpha_obj.inc_N_episodes() if self.learn_tracker is not None: self.learn_tracker.add_new_episode() # do dyna_q loops until sn_hash in terminal_set s_hash = start_state n_steps_in_episode = 1 while s_hash not in self.environment.terminal_set: if iter_sarsn is None: # get best epsilon-greedy action a_desc = self.action_value_coll.get_best_eps_greedy_action( \ s_hash, epsgreedy_obj=self.epsilon_obj ) # check for bad action value if a_desc is None: print('break for a_desc==None at s_hash=%s' % str(s_hash)) break # get next state and reward sn_hash, reward = self.environment.get_action_snext_reward( s_hash, a_desc) else: # retracing an existing episode s_hash, a_desc, reward, sn_hash = next(iter_sarsn) if self.learn_tracker is not None: self.learn_tracker.add_sarsn_to_current_episode( s_hash, a_desc, reward, sn_hash) if sn_hash is None: print('break for sn_hash==None, #steps=', n_steps_in_episode, ' s_hash=%s' % str(s_hash), ' a_desc=%s' % str(a_desc)) break # do RL update of Q(s,a) value self.action_value_coll.qlearning_update(s_hash=s_hash, a_desc=a_desc, sn_hash=sn_hash, alpha=self.alpha_obj(), gamma=self.gamma, reward=reward) self.num_updates += 1 # give the above experience to the model self.model.add_action(s_hash, a_desc) # force DETERMINISTIC next state and reward. self.model.save_deterministic_action_results(s_hash, a_desc, sn_hash, reward_val=reward) # do NOT use simple save_action_results... it allows NON-DETERMINISTIC next state. #self.model.save_action_results( s_hash, a_desc, sn_hash, reward_val=reward) # --------------------------------- Planning Loop ------------------------ # make Nplanning_loops calls to model for n_plan in range(Nplanning_loops): s_model = self.model.get_random_state() #print(s_model, end=' ') # vanilla DynaQ a_model = self.model.get_random_action(s_model) #sn_model, r_model = self.environment.get_action_snext_reward( s_model, a_model ) sn_model, r_model = self.model.get_sample_sn_r( s_model, a_model) # update for the DynaQ results. self.action_value_coll.qlearning_update(s_hash=s_model, a_desc=a_model, sn_hash=sn_model, alpha=self.alpha_obj(), gamma=self.gamma, reward=r_model) self.num_updates += 1 # keep a lid on the max number of episode steps. if n_steps_in_episode >= self.max_episode_steps: break # get ready for next loop n_steps_in_episode += 1 s_hash = sn_hash #print(n_steps_in_episode, end=' ') def summ_print(self, long=True): # pragma: no cover """Show State objects in sorted state_hash order.""" print('___ Policy Evaluation Agent Summary ___') print(' Environment = %s' % self.environment.name) print(' Number of Episodes = %g' % self.num_episodes)