Code example #1
    def __init__(self,
                 environment,
                 Nsteps=3,
                 epsilon=0.1,
                 init_q_val=0.0,
                 terminal_set=None,
                 max_steps=sys.maxsize):

        self.environment = environment
        self.av_coll = ActionValueColl(environment, init_val=init_q_val)

        # assume a constant epsilon for now.
        self.epsgreedy_obj = EpsilonGreedy(epsilon=epsilon,
                                           const_epsilon=True,
                                           half_life=200,
                                           N_episodes_wo_decay=0)

        if terminal_set is None:
            self.terminal_set = environment.terminal_set
        else:
            self.terminal_set = terminal_set

        self.Nsteps = Nsteps
        self.max_steps = max_steps

        self.clear()  # initialize the (s,a,r) data structures, t=0, T=inf.
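The constructor ends by calling clear(), which (as the full listing in Code example #5 shows) buffers the most recent Nsteps+1 (S, A, R) values in CircularList objects. The introrl CircularList itself is not reproduced in these excerpts, so the following is only a minimal sketch of the assumed wrap-around behavior:

# Minimal sketch of the wrap-around indexing that clear() relies on.
# The real introrl CircularList is not shown in these excerpts; this is an assumption.
class CircularList(list):
    def __getitem__(self, i):
        return super().__getitem__(i % len(self))

    def __setitem__(self, i, val):
        super().__setitem__(i % len(self), val)

buf = CircularList([0] * 4)   # Nsteps + 1 slots for Nsteps = 3
buf[5] = 'R5'                 # index 5 wraps around to slot 1
print(buf)                    # -> [0, 'R5', 0, 0]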
Code example #2
                delta = alpha * (
                    G - av_coll.get_val(self.S[self.tau], self.A[self.tau]))
                av_coll.delta_update(s_hash=self.S[self.tau],
                                     a_desc=self.A[self.tau],
                                     delta=delta)


if __name__ == "__main__":  # pragma: no cover

    from introrl.mdp_data.simple_grid_world import get_gridworld
    from introrl.policy import Policy
    from introrl.agent_supt.epsilon_calc import EpsilonGreedy
    from introrl.agent_supt.episode_maker import make_episode
    from introrl.agent_supt.action_value_coll import ActionValueColl

    gridworld = get_gridworld()
    sv = ActionValueColl(gridworld)

    pi = Policy(environment=gridworld)

    pi.set_policy_from_piD(gridworld.get_default_policy_desc_dict())
    #pi.summ_print()

    eg = EpsilonGreedy(epsilon=0.5,
                       const_epsilon=True,
                       half_life=200,
                       N_episodes_wo_decay=0)

    episode_obj = make_episode((2, 0), pi, gridworld, eps_greedy=None)
    """environment, Nsteps=3, 
                 policy=None, episode_obj=None, 
                 terminal_set=None,
Code example #3
def sarsa_epsilon_greedy( environment,  learn_tracker=None, # track progress of learning
                          initial_Qsa=0.0, # initial Q(s,a) for non-terminal states (terminal states = 0.0)
                          initial_action_value_coll=None, # if input, use it.
                          read_pickle_file='', 
                          save_pickle_file='',
                          use_list_of_start_states=False, # use list OR single start state of environment.
                          do_summ_print=True, show_last_change=True, fmt_Q='%g', fmt_R='%g',
                          pcent_progress_print=10,
                          show_banner = True,
                          max_num_episodes=1000, min_num_episodes=10, max_abserr=0.001, 
                          gamma=0.9,
                          iteration_prints=0,
                          max_episode_steps=sys.maxsize,
                          epsilon=0.1, const_epsilon=True, epsilon_half_life=200,
                          alpha=0.1, const_alpha=True, alpha_half_life=200,
                          N_episodes_wo_decay=0):
    """
    ... GIVEN AN ENVIRONMENT ... 
    apply SARSA Temporal Difference to find the OPTIMAL POLICY and STATE VALUES
    
    Returns: Policy and ActionValueColl objects
    
    Use Episode Discounted Returns to find V(s), State-Value Function
    
    Terminates when abserr < max_abserr
    
    Assume that V(s), action_value_coll, has been initialized prior to call.
    
    Assume environment attached to policy will have method "get_any_action_state_hash"
    in order to begin at any action state.
    
    CREATES BOTH policy AND action_value_coll OBJECTS.
    """
    
    # create EpsilonGreedy, Alpha and ActionValueColl objects
    eg = EpsilonGreedy(epsilon=epsilon, const_epsilon=const_epsilon, half_life=epsilon_half_life,
                       N_episodes_wo_decay=N_episodes_wo_decay)

    
    alpha_obj = Alpha( alpha=alpha, const_alpha=const_alpha, half_life=alpha_half_life )


    if initial_action_value_coll is None:
        action_value_coll = ActionValueColl( environment, init_val=initial_Qsa )
    else:
        action_value_coll = initial_action_value_coll
    #action_value_coll.summ_print()
    num_s_hash = len( environment.get_all_action_state_hashes() )

    if read_pickle_file:
        action_value_coll.init_from_pickle_file( read_pickle_file )
    
    if do_summ_print:
        print('================== EPSILON GREEDY DEFINED AS ========================')
        eg.summ_print()
        
        print('================== LEARNING RATE DEFINED AS ========================')
        alpha_obj.summ_print()
    
    if show_banner:
        s = 'Starting a Maximum of %i SARSA Epsilon Greedy Episodes'%max_num_episodes +\
            '\nfor "%s" with Gamma = %g, Alpha = %g'%( environment.name, gamma, alpha_obj() )
        banner(s, banner_char='', leftMargin=0, just='center')
        
    # Iterate over a list of known possible start states
    if use_list_of_start_states:
        loop_stateL = environment.limited_start_state_list()
    else:
        #loop_stateL = [ random.choice( environment.limited_start_state_list() ) ]
        loop_stateL = [ environment.start_state_hash ]
        
    if show_banner:
        print('======================= Iterating over Start States ==================================')
        print( loop_stateL )
        print('======================================================================================')

        
    # set counter and flag
    episode_loop_counter = 0
    keep_looping = True
    
    progress_str = ''
    while (episode_loop_counter < max_num_episodes) and keep_looping:
            
        keep_looping = False
        abserr = 0.0 # calculated below as part of termination criteria
        Nterminal_episodes = set() # tracks if start_hash got to terminal_set or max_num_episodes
        
        for start_hash in loop_stateL:
            episode_loop_counter += 1
            if episode_loop_counter > max_num_episodes:
                break
            
            if learn_tracker is not None:
                learn_tracker.add_new_episode()
            
            s_hash = start_hash
            a_desc = action_value_coll.get_best_eps_greedy_action( s_hash, epsgreedy_obj=eg )
            
            for n_episode_steps in range( max_episode_steps ):
                
                # Begin an episode
                if a_desc is None:
                    Nterminal_episodes.add( start_hash )
                    print('break for a_desc==None')
                    break
                else:
                    sn_hash, reward = environment.get_action_snext_reward( s_hash, a_desc )
                    if learn_tracker is not None:
                        learn_tracker.add_sarsn_to_current_episode( s_hash, a_desc, 
                                                                    reward, sn_hash)
                    
                    if sn_hash is None:
                        Nterminal_episodes.add( start_hash )
                        print('break for sn_hash==None')
                        break
                    else:
                        an_desc = action_value_coll.get_best_eps_greedy_action( sn_hash, 
                                                                                epsgreedy_obj=eg )
            
                        action_value_coll.sarsa_update( s_hash=s_hash, a_desc=a_desc, 
                                                        alpha=alpha_obj(), gamma=gamma, 
                                                        sn_hash=sn_hash, an_desc=an_desc, 
                                                        reward=reward)
                        
                        if sn_hash in environment.terminal_set:
                            Nterminal_episodes.add( start_hash )
                            if (n_episode_steps==0) and (num_s_hash>2):
                                print('1st step break for sn_hash in terminal_set', sn_hash, 
                                      ' s_hash=%s'%str(s_hash), ' a_desc=%s'%str(a_desc))
                            break
                        s_hash = sn_hash
                        a_desc = an_desc
        
        # increment episode counter on EpsilonGreedy and Alpha objects
        eg.inc_N_episodes()
        alpha_obj.inc_N_episodes()
                
        abserr = action_value_coll.get_biggest_action_state_err()
        if abserr > max_abserr:
            keep_looping = True
            
        if episode_loop_counter < min_num_episodes:
            keep_looping = True # must loop for min_num_episodes at least
            
        pc_done = 100.0 * float(episode_loop_counter) / float(max_num_episodes)
        
        if pcent_progress_print > 0:
            out_str = '%3i%%'%( pcent_progress_print*(int(pc_done/float(pcent_progress_print)) ) )
        else:
            out_str = progress_str
        
        if out_str != progress_str:
            #score = environment.get_policy_score( policy=policy, start_state_hash=None, step_limit=1000)
            #print(out_str, ' score=%s'%str(score), ' = (r_sum, n_steps, msg)', end=' ')
            
            print(out_str, end=' ')
            print( 'Nterminal episodes =', len(Nterminal_episodes),' of ', len(loop_stateL))
            progress_str = out_str
    #print()
    
    policy = action_value_coll.get_policy()
    
    if do_summ_print:
        s = ''
        if episode_loop_counter >= max_num_episodes:
            s = '   (NOTE: STOPPED ON MAX-ITERATIONS)'

        print( 'Exited SARSA Epsilon Greedy', s )
        print( '   # episodes      =', episode_loop_counter, ' (min limit=%i)'%min_num_episodes, ' (max limit=%i)'%max_num_episodes )
        print( '   gamma           =', gamma )
        print( '   estimated err   =', abserr )
        print( '   Error limit     =', max_abserr )
        print( 'Nterminal episodes =', len(Nterminal_episodes),' of ', len(loop_stateL))
    
        action_value_coll.summ_print(show_last_change=show_last_change, fmt_Q=fmt_Q )
        policy.summ_print(  environment=environment, verbosity=0, show_env_states=False  )
        
        try: # sims may not have a layout_print
            environment.layout_print( vname='reward', fmt=fmt_R, show_env_states=False, none_str='*')
        except Exception:
            pass

        print('================== EPSILON GREEDY DEFINED AS ========================')
        eg.summ_print()
        
        print('================== LEARNING RATE DEFINED AS ========================')
        alpha_obj.summ_print()

    if save_pickle_file:
        policy.save_to_pickle_file( save_pickle_file )
        action_value_coll.save_to_pickle_file( save_pickle_file )
        
    return policy, action_value_coll #, steps_per_episodeL, reward_sum_per_episodeL
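A minimal way to exercise sarsa_epsilon_greedy on the simple gridworld used in the other examples; the import path for the function itself is an assumption here, so adjust it to wherever the listing above actually lives:

from introrl.mdp_data.simple_grid_world import get_gridworld
# Assumed import path for the sarsa_epsilon_greedy function defined above.
from introrl.td_funcs.sarsa_eps_greedy import sarsa_epsilon_greedy

gridworld = get_gridworld()
policy, action_value_coll = sarsa_epsilon_greedy(gridworld,
                                                 max_num_episodes=500,
                                                 gamma=0.9,
                                                 alpha=0.1,
                                                 epsilon=0.1,
                                                 do_summ_print=True)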
Code example #4
# ALPHA and DO_QSTAR are not defined in this excerpt; representative values are assumed here.
DO_QSTAR = True   # True -> optimize with NStepSarsaQStarFinder; False -> evaluate a policy with NStepSarsaWalker
ALPHA = 0.1       # learning rate (assumed)
GAMMA = 0.9
NSTEPS = 8

rw_mrp = get_random_walk(Nside_states=9,
                         win_reward=1.0,
                         lose_reward=-1.0,
                         step_reward=0.0)

if DO_QSTAR:
    EPSILON = 0.1
    walker = NStepSarsaQStarFinder(rw_mrp, Nsteps=NSTEPS, epsilon=EPSILON)
    av_coll = walker.av_coll
else:
    policy = Policy(environment=rw_mrp)
    walker = NStepSarsaWalker(rw_mrp, Nsteps=NSTEPS, policy=policy)
    av_coll = ActionValueColl(rw_mrp, init_val=0.0)

#walker.av_coll.summ_print( fmt_Q='%.3f', none_str='*', show_states=True, show_last_change=True, show_policy=True)
print('<>' * 60)

for _ in range(200):
    if DO_QSTAR:
        walker.do_sarsa_action_value_updates(alpha=ALPHA,
                                             gamma=GAMMA,
                                             start_state_hash='C')
    else:
        walker.do_sarsa_action_value_updates(av_coll,
                                             alpha=ALPHA,
                                             gamma=GAMMA,
                                             start_state_hash='C')
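After the 200 update passes, the learned action values can be inspected with the same summ_print call that is commented out above, applied to whichever collection was updated:

# Print the learned Q(s,a) table (same call as the commented-out line above).
av_coll_to_show = walker.av_coll if DO_QSTAR else av_coll
av_coll_to_show.summ_print(fmt_Q='%.3f', none_str='*', show_states=True,
                           show_last_change=True, show_policy=True)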
Code example #5
class NStepSarsaQStarFinder(object):
    """
    Find the optimal policy by updating a ActionValueColl according to
    the n-step Sarsa algorithm from page 147 of Sutton&Barto 2nd Ed.
        
    When a terminal state is reached, or maximum number of steps is reached,
    do the final updates with ever-shortening from Nsteps, updates.
    
    Assume an eps_greedy for policy steps.
    Will call "eps_greedy.inc_N_episodes()" for any non-constant epsilon calcs.
    """
    def __init__(self,
                 environment,
                 Nsteps=3,
                 epsilon=0.1,
                 init_q_val=0.0,
                 terminal_set=None,
                 max_steps=sys.maxsize):

        self.environment = environment
        self.av_coll = ActionValueColl(environment, init_val=init_q_val)

        # assume a constant epsilon for now.
        self.epsgreedy_obj = EpsilonGreedy(epsilon=epsilon,
                                           const_epsilon=True,
                                           half_life=200,
                                           N_episodes_wo_decay=0)

        if terminal_set is None:
            self.terminal_set = environment.terminal_set
        else:
            self.terminal_set = terminal_set

        self.Nsteps = Nsteps
        self.max_steps = max_steps

        self.clear()  # initialize the (s,a,r) data structures, t=0, T=inf.

    def clear(self):
        # The (s,a,r) data will be in circular lists such that the index will wrap-around.
        self.S = CircularList([0] * (self.Nsteps + 1))
        self.A = CircularList([0] * (self.Nsteps + 1))
        self.R = CircularList([0] * (self.Nsteps + 1))

        self.t = 0  # current time value
        self.T = sys.maxsize  # T initialized to infinity
        self.tau = 0  # will be position getting update

    def initialize(self, start_state_hash=None):
        """
        initialize values at self.t
        (If start_state_hash is input use it, otherwise use environment.start_state_hash)
        """

        self.clear()  # initialize the (s,a,r) data structures, t=0, T=inf.

        if start_state_hash is None:
            start_state_hash = self.environment.start_state_hash

        self.S[0] = start_state_hash

        a_desc = self.av_coll.get_best_eps_greedy_action(
            start_state_hash, epsgreedy_obj=self.epsgreedy_obj)

        if a_desc is None:
            self.T = 0  # ending before we start

        self.A[0] = a_desc

        sn_hash, reward = self.environment.get_action_snext_reward(
            self.S[0], self.A[0])
        self.S[1] = sn_hash
        self.R[1] = reward

        if (sn_hash is None) or (sn_hash in self.terminal_set):
            self.T = 1  # ends pretty quickly
        else:
            # add next action, A[1]
            a_desc = self.av_coll.get_best_eps_greedy_action(
                self.S[1], epsgreedy_obj=self.epsgreedy_obj)
            self.A[1] = a_desc
            if a_desc is None:
                self.T = 1  # ending quickly

        self.tau = self.t - self.Nsteps + 1

    def add_step(self):
        """
        Add a step from the ActionValueColl and add it to the lists.
        Assume that self.t has been properly set.
        """

        a_desc = self.A[self.t]

        if a_desc is not None:
            sn_hash, reward = self.environment.get_action_snext_reward(
                self.S[self.t], self.A[self.t])
            self.S[self.t + 1] = sn_hash
            self.R[self.t + 1] = reward

            if (sn_hash is None) or (sn_hash in self.terminal_set):
                self.T = self.t + 1  # terminal
            else:
                # add next action
                a_desc = self.av_coll.get_best_eps_greedy_action(
                    self.S[self.t + 1], epsgreedy_obj=self.epsgreedy_obj)
                self.A[self.t + 1] = a_desc
                if a_desc is None:
                    self.T = self.t + 1  # terminal

    def do_sarsa_action_value_updates(
            self,
            alpha=0.1,
            gamma=1.0,
            start_state_hash=None):  # only used for policy, not episode_obj
        """
        Given an initialized NStepSarsaQStarFinder,
        Iterate through the returns for the episode
        
        Update the ActionValueColl, av_coll as part of the episode iteration.
        
        NOTE: The ActionValueColl will be updated as part of this method.
        """

        self.initialize(start_state_hash=start_state_hash)
        # should have t=0, T=infinity, tau=negative

        total_num_steps = 0

        while self.tau < self.T - 1:
            total_num_steps += 1
            if total_num_steps >= self.max_steps:
                break

            self.t += 1
            if self.t < self.T:
                # Take an action according to policy (or episode_obj)
                self.add_step()

            self.tau = self.t - self.Nsteps + 1
            if self.tau >= 0:
                # ------------------------------
                G = 0.0
                g_pow = 1.0  # running discount, gamma**(i - tau - 1)
                #print('       R=',self.R)
                for i in range(self.tau + 1,
                               min(self.tau + self.Nsteps, self.T) + 1):
                    G += g_pow * self.R[i]
                    g_pow *= gamma
                    #print('             at i=%i, R[i]=%g'%(i, self.R[i]))

                if self.tau + self.Nsteps < self.T:
                    g_pow = gamma ** self.Nsteps  # discount on the bootstrap term below
                    G += g_pow * self.av_coll.get_val(
                        self.S[self.tau + self.Nsteps],
                        self.A[self.tau + self.Nsteps])

                delta = alpha * (G - self.av_coll.get_val(
                    self.S[self.tau], self.A[self.tau]))
                self.av_coll.delta_update(s_hash=self.S[self.tau],
                                          a_desc=self.A[self.tau],
                                          delta=delta)
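For reference, the quantity computed inside the tau >= 0 branch above is the n-step Sarsa target from Sutton & Barto (2nd Ed.); in the code's notation, with n = Nsteps:

G_{\tau:\tau+n} = \sum_{i=\tau+1}^{\min(\tau+n,\,T)} \gamma^{\,i-\tau-1} R_i
                  + \begin{cases} \gamma^{n}\, Q(S_{\tau+n}, A_{\tau+n}) & \text{if } \tau+n < T \\ 0 & \text{otherwise} \end{cases}

Q(S_\tau, A_\tau) \leftarrow Q(S_\tau, A_\tau) + \alpha \bigl( G_{\tau:\tau+n} - Q(S_\tau, A_\tau) \bigr)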
Code example #6
    def __init__(
        self,
        environment,
        learn_tracker=None,  # track progress of learning
        initial_Qsa=0.0,  # initial Q(s,a) for non-terminal states (terminal states = 0.0)
        initial_action_value_coll=None,  # if input, use it.
        read_pickle_file='',
        save_pickle_file='',
        do_summ_print=True,
        show_last_change=True,
        pcent_progress_print=10,
        show_banner=True,
        gamma=0.9,
        iteration_prints=0,
        max_episode_steps=sys.maxsize,
        epsilon=0.1,  # can be constant or EpsilonGreedy object
        alpha=0.1):  # can be constant or Alpha object
        """
        ... GIVEN AN ENVIRONMENT ... 
        Use basic Dyna-Q algorithm to solve for STATE-ACTION VALUES, Q(s,a)
        
        Each action is forced to be a DETERMINISTIC action leading to one state and reward.
        (If the next state or reward changes, only the new values will be considered)
            
        attribute: self.action_value_coll is the ActionValueColl, Q(s,a) object
        
        A DETERMINISTIC policy can be created externally from the self.action_value_coll attribute.
        """
        self.environment = environment
        self.learn_tracker = learn_tracker
        self.save_pickle_file = save_pickle_file

        self.do_summ_print = do_summ_print
        self.show_last_change = show_last_change
        self.pcent_progress_print = pcent_progress_print

        self.gamma = gamma
        self.iteration_prints = iteration_prints
        self.max_episode_steps = max_episode_steps

        self.num_episodes = 0
        self.num_updates = 0

        # if input epsilon is a float, use it to create an EpsilonGreedy object
        if isinstance(epsilon, float):
            self.epsilon_obj = EpsilonGreedy(epsilon=epsilon,
                                             const_epsilon=True)
        else:
            self.epsilon_obj = epsilon

        # if input alpha is a float, use it to create an Alpha object
        if isinstance(alpha, float):
            self.alpha_obj = Alpha(alpha=alpha, const_alpha=True)
        else:
            self.alpha_obj = alpha

        # create the action_value_coll for the environment.
        if initial_action_value_coll is None:
            self.action_value_coll = ActionValueColl(environment,
                                                     init_val=initial_Qsa)
        else:
            self.action_value_coll = initial_action_value_coll

        if read_pickle_file:
            self.action_value_coll.init_from_pickle_file(read_pickle_file)

        # initialize the model that will build from experience
        # do not build full model description on Model init, states not visited
        #  by the RL portion will have no returns values.
        self.model = Model(environment, build_initial_model=False)
        #for s_hash, aD in self.action_value_coll.QsaD.items():
        #    for a_desc, Q in aD.items():
        #        self.model.add_action( s_hash, a_desc )

        if do_summ_print:
            print(
                '================== EPSILON GREEDY DEFINED AS ========================'
            )
            self.epsilon_obj.summ_print()

            print(
                '================== LEARNING RATE DEFINED AS ========================'
            )
            self.alpha_obj.summ_print()

        if show_banner:
            s = 'Starting a Maximum of %i Dyna-Q Epsilon Greedy Steps/Episode'%self.max_episode_steps +\
                '\nfor "%s" with Gamma = %g, Alpha = %g'%( environment.name, self.gamma, self.alpha_obj() )
            banner(s, banner_char='', leftMargin=0, just='center')
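Because __init__ accepts either plain floats or already-built schedule objects for epsilon and alpha, a decaying exploration or learning-rate schedule can be passed in directly instead of a constant. A sketch using the EpsilonGreedy and Alpha constructors seen in the earlier listings (Alpha's import path is not shown in these excerpts and is assumed; the agent class itself appears in full in Code example #7):

from introrl.agent_supt.epsilon_calc import EpsilonGreedy
# Alpha's import path is an assumption; only its constructor arguments appear in the excerpts.
from introrl.agent_supt.alpha_calc import Alpha

eg = EpsilonGreedy(epsilon=0.5, const_epsilon=False, half_life=200,
                   N_episodes_wo_decay=0)    # epsilon decays with a 200-episode half life
alpha_sched = Alpha(alpha=0.2, const_alpha=False, half_life=200)

# agent = DynaQAgent(environment, epsilon=eg, alpha=alpha_sched)   # see Code example #7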
Code example #7
class DynaQAgent(object):
    """
    DynaQ Agent.
    """
    def __init__(
        self,
        environment,
        learn_tracker=None,  # track progress of learning
        initial_Qsa=0.0,  # initial Q(s,a) for non-terminal states (terminal states = 0.0)
        initial_action_value_coll=None,  # if input, use it.
        read_pickle_file='',
        save_pickle_file='',
        do_summ_print=True,
        show_last_change=True,
        pcent_progress_print=10,
        show_banner=True,
        gamma=0.9,
        iteration_prints=0,
        max_episode_steps=sys.maxsize,
        epsilon=0.1,  # can be constant or EpsilonGreedy object
        alpha=0.1):  # can be constant or Alpha object
        """
        ... GIVEN AN ENVIRONMENT ... 
        Use basic Dyna-Q algorithm to solve for STATE-ACTION VALUES, Q(s,a)
        
        Each action is forced to be a DETERMINISTIC action leading to one state and reward.
        (If the next state or reward changes, only the new values will be considered)
            
        attribute: self.action_value_coll is the ActionValueColl, Q(s,a) object
        
        A DETERMINISTIC policy can be created externally from the self.action_value_coll attribute.
        """
        self.environment = environment
        self.learn_tracker = learn_tracker
        self.save_pickle_file = save_pickle_file

        self.do_summ_print = do_summ_print
        self.show_last_change = show_last_change
        self.pcent_progress_print = pcent_progress_print

        self.gamma = gamma
        self.iteration_prints = iteration_prints
        self.max_episode_steps = max_episode_steps

        self.num_episodes = 0
        self.num_updates = 0

        # if input epsilon is a float, use it to create an EpsilonGreedy object
        if isinstance(epsilon, float):
            self.epsilon_obj = EpsilonGreedy(epsilon=epsilon,
                                             const_epsilon=True)
        else:
            self.epsilon_obj = epsilon

        # if input alpha is a float, use it to create an Alpha object
        if isinstance(alpha, float):
            self.alpha_obj = Alpha(alpha=alpha, const_alpha=True)
        else:
            self.alpha_obj = alpha

        # create the action_value_coll for the environment.
        if initial_action_value_coll is None:
            self.action_value_coll = ActionValueColl(environment,
                                                     init_val=initial_Qsa)
        else:
            self.action_value_coll = initial_action_value_coll

        if read_pickle_file:
            self.action_value_coll.init_from_pickle_file(read_pickle_file)

        # initialize the model that will build from experience
        # do not build full model description on Model init, states not visited
        #  by the RL portion will have no returns values.
        self.model = Model(environment, build_initial_model=False)
        #for s_hash, aD in self.action_value_coll.QsaD.items():
        #    for a_desc, Q in aD.items():
        #        self.model.add_action( s_hash, a_desc )

        if do_summ_print:
            print(
                '================== EPSILON GREEDY DEFINED AS ========================'
            )
            self.epsilon_obj.summ_print()

            print(
                '================== LEARNING RATE DEFINED AS ========================'
            )
            self.alpha_obj.summ_print()

        if show_banner:
            s = 'Starting a Maximum of %i Dyna-Q Epsilon Greedy Steps/Episode'%self.max_episode_steps +\
                '\nfor "%s" with Gamma = %g, Alpha = %g'%( environment.name, self.gamma, self.alpha_obj() )
            banner(s, banner_char='', leftMargin=0, just='center')

    def run_episode(self, start_state, Nplanning_loops=5, iter_sarsn=None):
        """
        Run a single episode of Dyna-Q algorithm
        If iter_sarsn is input, use it instead of action_value_coll calculations.
        (Note: the start_state should NOT be in terminal_set if iter_sarsn is input.)
        """

        # increment episode counters
        self.num_episodes += 1
        self.epsilon_obj.inc_N_episodes()
        self.alpha_obj.inc_N_episodes()

        if self.learn_tracker is not None:
            self.learn_tracker.add_new_episode()

        # do dyna_q loops until sn_hash in terminal_set
        s_hash = start_state

        n_steps_in_episode = 1
        while s_hash not in self.environment.terminal_set:

            if iter_sarsn is None:
                # get best epsilon-greedy action
                a_desc = self.action_value_coll.get_best_eps_greedy_action( \
                                                s_hash, epsgreedy_obj=self.epsilon_obj )
                # check for bad action value
                if a_desc is None:
                    print('break for a_desc==None at s_hash=%s' % str(s_hash))
                    break

                # get next state and reward
                sn_hash, reward = self.environment.get_action_snext_reward(
                    s_hash, a_desc)
            else:
                # retracing an existing episode
                s_hash, a_desc, reward, sn_hash = next(iter_sarsn)

            if self.learn_tracker is not None:
                self.learn_tracker.add_sarsn_to_current_episode(
                    s_hash, a_desc, reward, sn_hash)

            if sn_hash is None:
                print('break for sn_hash==None, #steps=', n_steps_in_episode,
                      ' s_hash=%s' % str(s_hash), ' a_desc=%s' % str(a_desc))
                break

            # do RL update of Q(s,a) value
            self.action_value_coll.qlearning_update(s_hash=s_hash,
                                                    a_desc=a_desc,
                                                    sn_hash=sn_hash,
                                                    alpha=self.alpha_obj(),
                                                    gamma=self.gamma,
                                                    reward=reward)
            self.num_updates += 1
            # give the above experience to the model
            self.model.add_action(s_hash, a_desc)

            # force DETERMINISTIC next state and reward.
            self.model.save_deterministic_action_results(s_hash,
                                                         a_desc,
                                                         sn_hash,
                                                         reward_val=reward)

            # do NOT use simple save_action_results... it allows NON-DETERMINISTIC next state.
            #self.model.save_action_results( s_hash, a_desc, sn_hash, reward_val=reward)

            # --------------------------------- Planning Loop ------------------------
            # make Nplanning_loops calls to model
            for n_plan in range(Nplanning_loops):
                s_model = self.model.get_random_state()
                #print(s_model, end=' ')

                # vanilla DynaQ
                a_model = self.model.get_random_action(s_model)

                #sn_model, r_model = self.environment.get_action_snext_reward( s_model, a_model )
                sn_model, r_model = self.model.get_sample_sn_r(
                    s_model, a_model)

                # update for the DynaQ  results.
                self.action_value_coll.qlearning_update(s_hash=s_model,
                                                        a_desc=a_model,
                                                        sn_hash=sn_model,
                                                        alpha=self.alpha_obj(),
                                                        gamma=self.gamma,
                                                        reward=r_model)
                self.num_updates += 1

            # keep a lid on the max number of episode steps.
            if n_steps_in_episode >= self.max_episode_steps:
                break

            # get ready for next loop
            n_steps_in_episode += 1
            s_hash = sn_hash

        #print(n_steps_in_episode, end=' ')

    def summ_print(self, long=True):  # pragma: no cover
        """Show State objects in sorted state_hash order."""
        print('___ DynaQ Agent Summary ___')
        print('    Environment        = %s' % self.environment.name)
        print('    Number of Episodes = %g' % self.num_episodes)
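A minimal run of the DynaQAgent above on the simple gridworld, calling run_episode repeatedly from the environment's start state (the agent's import path is an assumption; adjust it to wherever the class above is defined):

from introrl.mdp_data.simple_grid_world import get_gridworld
# Assumed import path for the DynaQAgent class defined above.
from introrl.dyna.dyna_q_agent import DynaQAgent

gridworld = get_gridworld()
agent = DynaQAgent(gridworld, max_episode_steps=200, epsilon=0.1, alpha=0.1)

for _ in range(100):
    agent.run_episode(gridworld.start_state_hash, Nplanning_loops=5)

agent.summ_print()
agent.action_value_coll.summ_print()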