                delta = alpha * (G - av_coll.get_val(self.S[self.tau], self.A[self.tau]))
                av_coll.delta_update(s_hash=self.S[self.tau],
                                     a_desc=self.A[self.tau], delta=delta)


if __name__ == "__main__":  # pragma: no cover

    from introrl.mdp_data.simple_grid_world import get_gridworld
    from introrl.policy import Policy
    from introrl.agent_supt.epsilon_calc import EpsilonGreedy
    from introrl.agent_supt.episode_maker import make_episode
    from introrl.agent_supt.action_value_coll import ActionValueColl

    gridworld = get_gridworld()
    sv = ActionValueColl(gridworld)

    pi = Policy(environment=gridworld)
    pi.set_policy_from_piD(gridworld.get_default_policy_desc_dict())
    #pi.summ_print()

    eg = EpsilonGreedy(epsilon=0.5, const_epsilon=True, half_life=200,
                       N_episodes_wo_decay=0)

    episode_obj = make_episode((2, 0), pi, gridworld, eps_greedy=None)

    """environment, Nsteps=3, policy=None, episode_obj=None, terminal_set=None,
def sarsa_epsilon_greedy(environment, learn_tracker=None,  # track progress of learning
                         initial_Qsa=0.0,  # initial Q(s,a) for non-terminal states (terminal states = 0.0)
                         initial_action_value_coll=None,  # if input, use it.
                         read_pickle_file='',
                         save_pickle_file='',
                         use_list_of_start_states=False,  # use list OR single start state of environment.
                         do_summ_print=True, show_last_change=True, fmt_Q='%g', fmt_R='%g',
                         pcent_progress_print=10,
                         show_banner=True,
                         max_num_episodes=1000, min_num_episodes=10, max_abserr=0.001, gamma=0.9,
                         iteration_prints=0,
                         max_episode_steps=sys.maxsize,
                         epsilon=0.1, const_epsilon=True, epsilon_half_life=200,
                         alpha=0.1, const_alpha=True, alpha_half_life=200,
                         N_episodes_wo_decay=0):
    """
    ... GIVEN AN ENVIRONMENT ...
    apply SARSA Temporal Difference learning to find the OPTIMAL POLICY
    and ACTION VALUES, Q(s,a)

    Returns: Policy and ActionValueColl objects

    Uses one-step SARSA updates of Q(s,a) along epsilon-greedy episodes.

    Terminates when abserr < max_abserr

    Assume that Q(s,a), action_value_coll, has been initialized prior to call.

    Assume environment attached to policy will have method "get_any_action_state_hash"
    in order to begin at any action state.

    CREATES BOTH policy AND action_value_coll OBJECTS.
    """
    # create EpsilonGreedy, Alpha and ActionValueColl objects
    eg = EpsilonGreedy(epsilon=epsilon, const_epsilon=const_epsilon,
                       half_life=epsilon_half_life,
                       N_episodes_wo_decay=N_episodes_wo_decay)

    alpha_obj = Alpha(alpha=alpha, const_alpha=const_alpha, half_life=alpha_half_life)

    if initial_action_value_coll is None:
        action_value_coll = ActionValueColl(environment, init_val=initial_Qsa)
    else:
        action_value_coll = initial_action_value_coll
    #action_value_coll.summ_print()

    num_s_hash = len(environment.get_all_action_state_hashes())

    if read_pickle_file:
        action_value_coll.init_from_pickle_file(read_pickle_file)

    if do_summ_print:
        print('================== EPSILON GREEDY DEFINED AS ========================')
        eg.summ_print()

        print('================== LEARNING RATE DEFINED AS ========================')
        alpha_obj.summ_print()

    if show_banner:
        s = 'Starting a Maximum of %i SARSA Epsilon Greedy Episodes'%max_num_episodes +\
            '\nfor "%s" with Gamma = %g, Alpha = %g'%( environment.name, gamma, alpha_obj() )
        banner(s, banner_char='', leftMargin=0, just='center')

    # Iterate over a list of known possible start states
    if use_list_of_start_states:
        loop_stateL = environment.limited_start_state_list()
    else:
        #loop_stateL = [ random.choice( environment.limited_start_state_list() ) ]
        loop_stateL = [ environment.start_state_hash ]

    if show_banner:
        print('======================= Iterating over Start States ==================================')
        print( loop_stateL )
        print('======================================================================================')

    # set counter and flag
    episode_loop_counter = 0
    keep_looping = True

    progress_str = ''
    while (episode_loop_counter <= max_num_episodes-1) and keep_looping:

        keep_looping = False
        abserr = 0.0  # calculated below as part of termination criteria
        Nterminal_episodes = set()  # tracks if start_hash got to terminal_set or max_num_episodes

        for start_hash in loop_stateL:
            episode_loop_counter += 1
            if episode_loop_counter > max_num_episodes:
                break

            if learn_tracker is not None:
                learn_tracker.add_new_episode()

            s_hash = start_hash
            a_desc = action_value_coll.get_best_eps_greedy_action( s_hash, epsgreedy_obj=eg )

            for n_episode_steps in range( max_episode_steps ):
                # Begin an episode
                if a_desc is None:
                    Nterminal_episodes.add( start_hash )
                    print('break for a_desc==None')
                    break
                else:
                    sn_hash, reward = environment.get_action_snext_reward( s_hash, a_desc )

                    if learn_tracker is not None:
                        learn_tracker.add_sarsn_to_current_episode( s_hash, a_desc,
                                                                    reward, sn_hash)
                    if sn_hash is None:
                        Nterminal_episodes.add( start_hash )
                        print('break for sn_hash==None')
                        break
                    else:
                        an_desc = action_value_coll.get_best_eps_greedy_action( sn_hash,
                                                                                epsgreedy_obj=eg )

                        action_value_coll.sarsa_update( s_hash=s_hash, a_desc=a_desc,
                                                        alpha=alpha_obj(), gamma=gamma,
                                                        sn_hash=sn_hash, an_desc=an_desc,
                                                        reward=reward)

                        if sn_hash in environment.terminal_set:
                            Nterminal_episodes.add( start_hash )
                            if (n_episode_steps==0) and (num_s_hash>2):
                                print('1st step break for sn_hash in terminal_set', sn_hash,
                                      ' s_hash=%s'%str(s_hash), ' a_desc=%s'%str(a_desc))
                            break

                        s_hash = sn_hash
                        a_desc = an_desc

            # increment episode counter on EpsilonGreedy and Alpha objects
            eg.inc_N_episodes()
            alpha_obj.inc_N_episodes()

        abserr = action_value_coll.get_biggest_action_state_err()
        if abserr > max_abserr:
            keep_looping = True

        if episode_loop_counter < min_num_episodes:
            keep_looping = True  # must loop for min_num_episodes at least

        pc_done = 100.0 * float(episode_loop_counter) / float(max_num_episodes)
        if pcent_progress_print > 0:
            out_str = '%3i%%'%( pcent_progress_print*(int(pc_done/float(pcent_progress_print)) ) )
        else:
            out_str = progress_str

        if out_str != progress_str:
            #score = environment.get_policy_score( policy=policy, start_state_hash=None, step_limit=1000)
            #print(out_str, ' score=%s'%str(score), ' = (r_sum, n_steps, msg)', end=' ')
            print(out_str, end=' ')
            print( 'Nterminal episodes =', len(Nterminal_episodes),' of ', len(loop_stateL))
            progress_str = out_str
    #print()

    policy = action_value_coll.get_policy()

    if do_summ_print:
        s = ''
        if episode_loop_counter >= max_num_episodes:
            s = ' (NOTE: STOPPED ON MAX-ITERATIONS)'
        print( 'Exited SARSA Epsilon Greedy', s )
        print( '   # episodes =', episode_loop_counter, ' (min limit=%i)'%min_num_episodes,
               ' (max limit=%i)'%max_num_episodes )
        print( '   gamma =', gamma )
        print( '   estimated err =', abserr )
        print( '   Error limit =', max_abserr )
        print( 'Nterminal episodes =', len(Nterminal_episodes),' of ', len(loop_stateL))

        action_value_coll.summ_print(show_last_change=show_last_change, fmt_Q=fmt_Q )
        policy.summ_print( environment=environment, verbosity=0, show_env_states=False )

        try:  # sims may not have a layout_print
            environment.layout_print( vname='reward', fmt=fmt_R, show_env_states=False, none_str='*')
        except:
            pass

        print('================== EPSILON GREEDY DEFINED AS ========================')
        eg.summ_print()

        print('================== LEARNING RATE DEFINED AS ========================')
        alpha_obj.summ_print()

    if save_pickle_file:
        policy.save_to_pickle_file( save_pickle_file )
        action_value_coll.save_to_pickle_file( save_pickle_file )

    return policy, action_value_coll  #, steps_per_episodeL, reward_sum_per_episodeL
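# Hedged usage sketch (illustration only, not part of the library): one way
# sarsa_epsilon_greedy might be called on the simple gridworld imported in the
# __main__ block earlier in this file.  The episode count and hyperparameter
# values below are illustrative assumptions, not required settings.
if __name__ == "__main__":  # pragma: no cover
    from introrl.mdp_data.simple_grid_world import get_gridworld

    gridworld = get_gridworld()

    # run a modest number of episodes and print the resulting greedy policy and Q(s,a)
    policy, action_value_coll = sarsa_epsilon_greedy(gridworld,
                                                     max_num_episodes=500,
                                                     gamma=0.9,
                                                     epsilon=0.1,
                                                     alpha=0.1,
                                                     do_summ_print=True)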
GAMMA = 0.9
NSTEPS = 8

rw_mrp = get_random_walk(Nside_states=9, win_reward=1.0, lose_reward=-1.0, step_reward=0.0)

if DO_QSTAR:
    EPSILON = 0.1
    walker = NStepSarsaQStarFinder(rw_mrp, Nsteps=NSTEPS, epsilon=EPSILON)
    av_coll = walker.av_coll
else:
    policy = Policy(environment=rw_mrp)
    walker = NStepSarsaWalker(rw_mrp, Nsteps=NSTEPS, policy=policy)
    av_coll = ActionValueColl(rw_mrp, init_val=0.0)

#walker.av_coll.summ_print( fmt_Q='%.3f', none_str='*', show_states=True, show_last_change=True, show_policy=True)
print('<>' * 60)

for _ in range(200):
    if DO_QSTAR:
        walker.do_sarsa_action_value_updates(alpha=ALPHA, gamma=GAMMA, start_state_hash='C')
    else:
        walker.do_sarsa_action_value_updates(av_coll, alpha=ALPHA, gamma=GAMMA,
                                             start_state_hash='C')
class NStepSarsaQStarFinder(object):
    """
    Find the optimal policy by updating an ActionValueColl according to the
    n-step Sarsa algorithm from page 147 of Sutton&Barto 2nd Ed.

    When a terminal state is reached, or the maximum number of steps is reached,
    do the final updates with ever-shortening n-step returns.

    Assume an eps_greedy policy for action selection.
    Will call "eps_greedy.inc_N_episodes()" for any non-constant epsilon calcs.
    """

    def __init__(self, environment, Nsteps=3, epsilon=0.1, init_q_val=0.0,
                 terminal_set=None, max_steps=sys.maxsize):

        self.environment = environment
        self.av_coll = ActionValueColl(environment, init_val=init_q_val)

        # assume a constant epsilon for now.
        self.epsgreedy_obj = EpsilonGreedy(epsilon=epsilon, const_epsilon=True,
                                           half_life=200, N_episodes_wo_decay=0)

        if terminal_set is None:
            self.terminal_set = environment.terminal_set
        else:
            self.terminal_set = terminal_set

        self.Nsteps = Nsteps
        self.max_steps = max_steps

        self.clear()  # initialize the (s,a,r) data structures, t=0, T=inf.

    def clear(self):
        # The (s,a,r) data will be in circular lists such that the index will wrap around.
        self.S = CircularList([0] * (self.Nsteps + 1))
        self.A = CircularList([0] * (self.Nsteps + 1))
        self.R = CircularList([0] * (self.Nsteps + 1))

        self.t = 0            # current time value
        self.T = sys.maxsize  # T initialized to infinity
        self.tau = 0          # will be position getting update

    def initialize(self, start_state_hash=None):
        """
        Initialize values at self.t
        (If start_state_hash is input use it, otherwise use environment.start_state_hash)
        """
        self.clear()  # initialize the (s,a,r) data structures, t=0, T=inf.

        if start_state_hash is None:
            start_state_hash = self.environment.start_state_hash
        self.S[0] = start_state_hash

        a_desc = self.av_coll.get_best_eps_greedy_action(
            start_state_hash, epsgreedy_obj=self.epsgreedy_obj)
        if a_desc is None:
            self.T = 0  # ending before we start
        self.A[0] = a_desc

        sn_hash, reward = self.environment.get_action_snext_reward(self.S[0], self.A[0])
        self.S[1] = sn_hash
        self.R[1] = reward

        if (sn_hash is None) or (sn_hash in self.terminal_set):
            self.T = 1  # ends pretty quickly
        else:
            # add next action, A[1]
            a_desc = self.av_coll.get_best_eps_greedy_action(
                self.S[1], epsgreedy_obj=self.epsgreedy_obj)
            self.A[1] = a_desc
            if a_desc is None:
                self.T = 1  # ending quickly

        self.tau = self.t - self.Nsteps + 1

    def add_step(self):
        """
        Get a step from the ActionValueColl and add it to the lists.
        Assume that self.t has been properly set.
        """
        a_desc = self.A[self.t]
        if a_desc is not None:
            sn_hash, reward = self.environment.get_action_snext_reward(
                self.S[self.t], self.A[self.t])
            self.S[self.t + 1] = sn_hash
            self.R[self.t + 1] = reward

            if (sn_hash is None) or (sn_hash in self.terminal_set):
                self.T = self.t + 1  # terminal
            else:
                # add next action
                a_desc = self.av_coll.get_best_eps_greedy_action(
                    self.S[self.t + 1], epsgreedy_obj=self.epsgreedy_obj)
                self.A[self.t + 1] = a_desc
                if a_desc is None:
                    self.T = self.t + 1  # terminal

    def do_sarsa_action_value_updates(self, alpha=0.1, gamma=1.0,
                                      start_state_hash=None):  # only used for policy, not episode_obj
        """
        Given an initialized NStepSarsaQStarFinder, iterate through the returns
        for the episode.

        Update the ActionValueColl, av_coll, as part of the episode iteration.

        NOTE: The ActionValueColl will be updated as part of this method.
""" self.initialize(start_state_hash=start_state_hash) # should have t=0, T=infinity, tau=negative total_num_steps = 0 while self.tau < self.T - 1: total_num_steps += 1 if total_num_steps >= self.max_steps: break self.t += 1 if self.t < self.T: # Take an action according to policy (or episode_obj) self.add_step() self.tau = self.t - self.Nsteps + 1 if self.tau >= 0: # ------------------------------ G = 0.0 g_pow = 1.0 # gamma**n #print(' R=',self.R) for i in range(self.tau + 1, min(self.tau + self.Nsteps, self.T) + 1): G += g_pow * self.R[i] g_pow *= gamma #print(' at i=%i, R[i]=%g'%(i, self.R[i])) if self.tau + self.Nsteps < self.T: gpow = gamma**self.Nsteps G += g_pow * self.av_coll.get_val( self.S[self.tau + self.Nsteps], self.A[self.tau + self.Nsteps]) delta = alpha * (G - self.av_coll.get_val( self.S[self.tau], self.A[self.tau])) self.av_coll.delta_update(s_hash=self.S[self.tau], a_desc=self.A[self.tau], delta=delta)
class DynaQAgent(object):
    """
    DynaQ Agent.
    """

    def __init__(self, environment, learn_tracker=None,  # track progress of learning
                 initial_Qsa=0.0,  # initial Q(s,a) for non-terminal states (terminal states = 0.0)
                 initial_action_value_coll=None,  # if input, use it.
                 read_pickle_file='',
                 save_pickle_file='',
                 do_summ_print=True, show_last_change=True,
                 pcent_progress_print=10,
                 show_banner=True,
                 gamma=0.9,
                 iteration_prints=0,
                 max_episode_steps=sys.maxsize,
                 epsilon=0.1,  # can be constant or EpsilonGreedy object
                 alpha=0.1):   # can be constant or Alpha object
        """
        ... GIVEN AN ENVIRONMENT ...
        Use the basic Dyna-Q algorithm to solve for STATE-ACTION VALUES, Q(s,a)

        Each action is forced to be a DETERMINISTIC action leading to one state and reward.
        (If the next state or reward changes, only the new values will be considered)

        attribute: self.action_value_coll is the ActionValueColl, Q(s,a) object

        A DETERMINISTIC policy can be created externally from the
        self.action_value_coll attribute.
        """
        self.environment = environment
        self.learn_tracker = learn_tracker
        self.save_pickle_file = save_pickle_file

        self.do_summ_print = do_summ_print
        self.show_last_change = show_last_change
        self.pcent_progress_print = pcent_progress_print

        self.gamma = gamma
        self.iteration_prints = iteration_prints
        self.max_episode_steps = max_episode_steps

        self.num_episodes = 0
        self.num_updates = 0

        # if input epsilon is a float, use it to create an EpsilonGreedy object
        if isinstance(epsilon, float):
            self.epsilon_obj = EpsilonGreedy(epsilon=epsilon, const_epsilon=True)
        else:
            self.epsilon_obj = epsilon

        # if input alpha is a float, use it to create an Alpha object
        if isinstance(alpha, float):
            self.alpha_obj = Alpha(alpha=alpha, const_alpha=True)
        else:
            self.alpha_obj = alpha

        # create the action_value_coll for the environment.
        if initial_action_value_coll is None:
            self.action_value_coll = ActionValueColl(environment, init_val=initial_Qsa)
        else:
            self.action_value_coll = initial_action_value_coll

        if read_pickle_file:
            self.action_value_coll.init_from_pickle_file(read_pickle_file)

        # initialize the model that will be built from experience.
        # Do not build the full model description on Model init; states not visited
        # by the RL portion will have no return values.
        self.model = Model(environment, build_initial_model=False)

        #for s_hash, aD in self.action_value_coll.QsaD.items():
        #    for a_desc, Q in aD.items():
        #        self.model.add_action( s_hash, a_desc )

        if do_summ_print:
            print('================== EPSILON GREEDY DEFINED AS ========================')
            self.epsilon_obj.summ_print()

            print('================== LEARNING RATE DEFINED AS ========================')
            self.alpha_obj.summ_print()

        if show_banner:
            s = 'Starting a Maximum of %i Dyna-Q Epsilon Greedy Steps/Episode'%self.max_episode_steps +\
                '\nfor "%s" with Gamma = %g, Alpha = %g'%( environment.name, self.gamma, self.alpha_obj() )
            banner(s, banner_char='', leftMargin=0, just='center')

    def run_episode(self, start_state, Nplanning_loops=5, iter_sarsn=None):
        """
        Run a single episode of the Dyna-Q algorithm.

        If iter_sarsn is input, use it instead of action_value_coll calculations.
        (Note: the start_state should NOT be in terminal_set if iter_sarsn is input.)
""" # increment episode counters self.num_episodes += 1 self.epsilon_obj.inc_N_episodes() self.alpha_obj.inc_N_episodes() if self.learn_tracker is not None: self.learn_tracker.add_new_episode() # do dyna_q loops until sn_hash in terminal_set s_hash = start_state n_steps_in_episode = 1 while s_hash not in self.environment.terminal_set: if iter_sarsn is None: # get best epsilon-greedy action a_desc = self.action_value_coll.get_best_eps_greedy_action( \ s_hash, epsgreedy_obj=self.epsilon_obj ) # check for bad action value if a_desc is None: print('break for a_desc==None at s_hash=%s' % str(s_hash)) break # get next state and reward sn_hash, reward = self.environment.get_action_snext_reward( s_hash, a_desc) else: # retracing an existing episode s_hash, a_desc, reward, sn_hash = next(iter_sarsn) if self.learn_tracker is not None: self.learn_tracker.add_sarsn_to_current_episode( s_hash, a_desc, reward, sn_hash) if sn_hash is None: print('break for sn_hash==None, #steps=', n_steps_in_episode, ' s_hash=%s' % str(s_hash), ' a_desc=%s' % str(a_desc)) break # do RL update of Q(s,a) value self.action_value_coll.qlearning_update(s_hash=s_hash, a_desc=a_desc, sn_hash=sn_hash, alpha=self.alpha_obj(), gamma=self.gamma, reward=reward) self.num_updates += 1 # give the above experience to the model self.model.add_action(s_hash, a_desc) # force DETERMINISTIC next state and reward. self.model.save_deterministic_action_results(s_hash, a_desc, sn_hash, reward_val=reward) # do NOT use simple save_action_results... it allows NON-DETERMINISTIC next state. #self.model.save_action_results( s_hash, a_desc, sn_hash, reward_val=reward) # --------------------------------- Planning Loop ------------------------ # make Nplanning_loops calls to model for n_plan in range(Nplanning_loops): s_model = self.model.get_random_state() #print(s_model, end=' ') # vanilla DynaQ a_model = self.model.get_random_action(s_model) #sn_model, r_model = self.environment.get_action_snext_reward( s_model, a_model ) sn_model, r_model = self.model.get_sample_sn_r( s_model, a_model) # update for the DynaQ results. self.action_value_coll.qlearning_update(s_hash=s_model, a_desc=a_model, sn_hash=sn_model, alpha=self.alpha_obj(), gamma=self.gamma, reward=r_model) self.num_updates += 1 # keep a lid on the max number of episode steps. if n_steps_in_episode >= self.max_episode_steps: break # get ready for next loop n_steps_in_episode += 1 s_hash = sn_hash #print(n_steps_in_episode, end=' ') def summ_print(self, long=True): # pragma: no cover """Show State objects in sorted state_hash order.""" print('___ Policy Evaluation Agent Summary ___') print(' Environment = %s' % self.environment.name) print(' Number of Episodes = %g' % self.num_episodes)