Example #1
    def summ_print(self, long=False, time_stamp=None): # pragma: no cover
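        """Print the base Model summary, then a State/Action time-stamp table
        showing the Age (time since last update) of each state-action pair."""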
        
        Model.summ_print(self, long=long)
        
        if time_stamp is None:
            """approximate time_stamp with largest model time_stamp"""
            time_stamp = 0
            for t in self.state_action_time_stampD.values():
                time_stamp = max(t, time_stamp)
                
        
        # get all states and figure out formatting
        sL = sorted( [s_hash for s_hash in self.define_statesD.keys()], key=NaturalOrStrKey )
        max_len = max(6, max([len( str(s) ) for s in sL]))
        fmt = '%' + '%is'%max_len
        
        # get all actions for each state and figure out formatting
        astrL = [RSA.get_action_desc_str() for RSA in self.define_statesD.values()]
        max_a_len = max(6, max([len( str(a) ) for a in astrL]))
        fmt_a = '%' + '%is'%max_a_len

        max_a2_len = 0
        max_det_len = 0
        for s_hash in sL:
            RSA = self.define_statesD[s_hash]
            aL = [a_desc for a_desc in RSA.action_countD.keys()]
            max_a2_len = max(max_a2_len, max( [len(a) for a in aL] ))
            
            max_det_len = max(max_det_len, len(RSA.get_state_deterministic_desc().strip()) )
            
        fmt_a2 = '%' + '%is'%max_a2_len
        fmt_det = '%-' + '%is'%max_det_len
                        
        print('___________________________________________________')
        print('             State/Action TimeStamps               ')
        print('___________________________________________________')
        for s_hash in sL:
            RSA = self.define_statesD[s_hash]
            
            aL = sorted( [a_desc for a_desc in RSA.action_countD.keys()], key=NaturalOrStrKey )
            #print('aL =',aL, type(aL))
            
            # self.state_action_time_stampD = {} # index=(s_hash,a_desc), value=time_stamp
            tstampL =  [ fmt_a2%str(a)+'=%i'%(time_stamp - self.state_action_time_stampD[(s_hash,a)],) for a in aL ]
            
            print( fmt%str(s_hash), fmt_a%RSA.get_action_desc_str(),
                   '...', fmt_det%RSA.get_state_deterministic_desc().strip(),' Age:', ', '.join(tstampL) )
Example #2
if __name__ == "__main__": # pragma: no cover
    
    import time
    import os, sys
    from introrl.agent_supt.model import Model
    from introrl.environments.env_baseline import EnvBaseline
    from introrl.dp_funcs.dp_value_iter import dp_value_iteration
    from introrl.utils import pickle_esp
    
    start_time = time.time()
    
    RW = RandomWalk_1000Simulation()
    #RW.layout.s_hash_print( none_str='*' )
    
    
    get_sim = Model( RW, build_initial_model=True )

    get_sim.collect_transition_data( num_det_calls=100, num_stoic_calls=10000 )

    RW.layout.s_hash_print()

    #get_sim.num_calls_layout_print()
    #get_sim.min_num_calls_layout_print()
    
    env = EnvBaseline( s_hash_rowL=RW.s_hash_rowL, 
                       x_axis_label=RW.x_axis_label, 
                       y_axis_label=RW.y_axis_label )
                       
    get_sim.add_all_data_to_an_environment( env )

    policy, state_value = dp_value_iteration( env, do_summ_print=True, fmt_V='%.3f', fmt_R='%.1f',
Example #3
        return lim_stateL


if __name__ == "__main__":  # pragma: no cover

    import time
    import os, sys
    from introrl.dp_funcs.dp_value_iter import dp_value_iteration
    from introrl.environments.env_baseline import EnvBaseline
    from introrl.agent_supt.model import Model

    start_time = time.time()

    BJ = BlackJackSimulation()

    get_sim = Model(BJ, build_initial_model=True)

    # if there's a pickle file, read it
    fname = os.path.split(__file__)[-1].split('.')[0]  # use file prefix for pickle file
    print('Pickle File Name Prefix:', fname)

    if not get_sim.read_pickle_file(fname):
        get_sim.collect_transition_data(num_det_calls=10,
                                        num_stoic_calls=10000)

    #get_sim.collect_transition_data( num_det_calls=10, num_stoic_calls=10000 )

    print('Total recorded actions Before:',
          "{:,}".format(get_sim.total_num_action_data_points()))
Example #4
                    
                    if sn_count == len(snD):
                        print()
                    
                    
        print('____'+'_'*len(header))
                

if __name__ == "__main__": # pragma: no cover
    
    from introrl.agent_supt.model import Model
    from introrl.mdp_data.simple_grid_world import get_gridworld
    
    gridworld = get_gridworld()
    
    get_sim = Model( gridworld, build_initial_model=True )
    
    # ---------- make a few stochastic to test summ_print
    #get_sim.define_statesD[s_hash].save_action_results( a_desc, sn_hash, reward_val)
    
    # make just the reward stochastic
    get_sim.define_statesD[(0, 2)].save_action_results( 'R', (0,3), 2.0)
    
    # make the action stochastic
    get_sim.define_statesD[(1,0)].save_action_results( 'U', 'XXX', 0.0)
    
    # make both the action and reward stochastic
    get_sim.define_statesD[(2,2)].save_action_results( 'U', 'XXX', 2.0)
    get_sim.define_statesD[(2,2)].save_action_results( 'U', 'XXX', 2.2)
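
    # Hypothetical continuation (not part of the original file): print the model
    # summary so the stochastic entries injected above can be inspected.
    get_sim.summ_print(long=True)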

    
Example #5

if __name__ == "__main__":  # pragma: no cover

    import time
    import os, sys
    from introrl.dp_funcs.dp_value_iter import dp_value_iteration
    from introrl.environments.env_baseline import EnvBaseline
    from introrl.agent_supt.model import Model
    from introrl.utils import pickle_esp

    start_time = time.time()

    CR = CarRentalSimulation()

    get_sim = Model(CR, build_initial_model=True)

    get_sim.collect_transition_data(num_det_calls=50, num_stoic_calls=100000)

    print('Total recorded actions Before:',
          "{:,}".format(get_sim.total_num_action_data_points()))

    CR.layout.s_hash_print()
    get_sim.num_calls_layout_print(row_tickL=[c for c in '   First Location'],
                                   const_col_w=True,
                                   x_axis_label='Second Location',
                                   none_str='*')

    get_sim.min_num_calls_layout_print(
        row_tickL=[c for c in '   First Location'],
        const_col_w=True,
Example #6
    def __init__(self, env_interface, build_initial_model=False): # Interface (can be sim or env)

        # add dictionary to track time_stamp
        self.state_action_time_stampD = {} # index=(s_hash,a_desc), value=time_stamp

        Model.__init__(self, env_interface, build_initial_model=build_initial_model )
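
Examples #1 and #6 together sketch a Model subclass that stamps every (state, action) pair and reports each pair's age in summ_print. The hook that actually writes the stamps is not shown in these excerpts, so the sketch below uses a hypothetical record_time_stamp helper purely to illustrate how state_action_time_stampD feeds the Age column.

# Minimal sketch, not from the introrl source.  Only state_action_time_stampD and
# the Age arithmetic come from Examples #1 and #6; record_time_stamp and the
# integer clock are illustrative assumptions.
from introrl.agent_supt.model import Model

class TimeStampedModel(Model):

    def __init__(self, env_interface, build_initial_model=False):
        self.state_action_time_stampD = {}  # index=(s_hash,a_desc), value=time_stamp
        self.clock = 0                      # assumed monotonically increasing counter
        Model.__init__(self, env_interface, build_initial_model=build_initial_model)

    def record_time_stamp(self, s_hash, a_desc):
        """Hypothetical hook: stamp (s_hash, a_desc) with the current clock value."""
        self.clock += 1
        self.state_action_time_stampD[(s_hash, a_desc)] = self.clock

# In Example #1 the Age printed for each pair is (largest stamp) - (pair's stamp),
# so recently exercised pairs show an Age near zero.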
Example #7
        return ['U', 'D', 'R', 'L']


if __name__ == "__main__":  # pragma: no cover

    import time
    import os, sys
    from introrl.td_funcs.qlearning_epsilon_greedy import qlearning_epsilon_greedy
    from introrl.td_funcs.sarsa_epsilon_greedy import sarsa_epsilon_greedy
    from introrl.agent_supt.model import Model

    bmaze = BlockingMaze()
    bmaze.open_gate_R()
    bmaze.close_gate_L()

    env = Model(bmaze, build_initial_model=True)
    env.collect_transition_data(num_det_calls=10, num_stoic_calls=1000)
    env.summ_print(long=False)

    bmaze.layout.s_hash_print(none_str='*')
    bmaze.open_gate_L()
    bmaze.close_gate_R()
    env.collect_transition_data(num_det_calls=10, num_stoic_calls=1000)
    env.summ_print(long=False)

    policy, action_value = \
        sarsa_epsilon_greedy( bmaze,
                              initial_Qsa=0.0, # init non-terminal_set of Q(s,a) (terminal_set=0.0)
                              read_pickle_file='',
                              save_pickle_file='',
                              use_list_of_start_states=False, # use list OR single start state of environment.
Example #8
if __name__ == "__main__":  # pragma: no cover

    import time
    import os, sys
    from introrl.dp_funcs.dp_value_iter import dp_value_iteration
    from introrl.environments.env_baseline import EnvBaseline
    #from introrl.black_boxes.collect_sim_data import CollectSimData
    from introrl.agent_supt.model import Model

    start_time = time.time()

    s_hash_rowL = ((0, 1, 2, 3, 4), )
    CR = Simulation(s_hash_rowL=s_hash_rowL)

    #get_sim = CollectSimData( CR )
    get_sim = Model(CR, build_initial_model=True)

    # if there's a pickle file, read it
    fname = os.path.split(__file__)[-1].split('.')[0]  # use file prefix for pickle file
    print('Pickle File Name Prefix:', fname)

    if not get_sim.read_pickle_file(fname):
        get_sim.collect_transition_data(num_det_calls=10, num_stoic_calls=1000)

    print('Total recorded actions Before:',
          "{:,}".format(get_sim.total_num_action_data_points()))
    get_sim.collect_transition_data(num_det_calls=10, num_stoic_calls=100)
    print('Total recorded actions After:',
          "{:,}".format(get_sim.total_num_action_data_points()))
Example #9
import time
from introrl.dp_funcs.dp_value_iter import dp_value_iteration
from introrl.environments.env_baseline import EnvBaseline
from introrl.agent_supt.model import Model
from introrl.utils import pickle_esp
from introrl.black_box_sims.blackjack_sim import BlackJackSimulation

start_time = time.time()

BJ = BlackJackSimulation()
get_sim = Model(BJ, build_initial_model=True)

get_sim.collect_transition_data(num_det_calls=50, num_stoic_calls=100000)

BJ.layout.s_hash_print()

get_sim.num_calls_layout_print()
get_sim.min_num_calls_layout_print()

print('got sim data')
print('_' * 55)

env = EnvBaseline(s_hash_rowL=BJ.s_hash_rowL,
                  x_axis_label=BJ.x_axis_label,
                  y_axis_label=BJ.y_axis_label)

get_sim.add_all_data_to_an_environment(env)

print('built environment')
print('_' * 55)
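
The excerpt stops after the environment is assembled; a hedged continuation sketch, using only the dp_value_iteration keywords visible in Example #2 (the closing timing print is also an assumption):

# hypothetical continuation, not part of the original script
policy, state_value = dp_value_iteration(env, do_summ_print=True,
                                         fmt_V='%.3f', fmt_R='%.1f')

print('Elapsed time:', time.time() - start_time, 'seconds')  # assumed wrap-up step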
Example #10
    def __init__(
        self,
        environment,
        learn_tracker=None,  # track progress of learning
        initial_Qsa=0.0,  # init non-terminal_set of Q(s,a) (terminal_set=0.0)
        initial_action_value_coll=None,  # if input, use it.
        read_pickle_file='',
        save_pickle_file='',
        do_summ_print=True,
        show_last_change=True,
        pcent_progress_print=10,
        show_banner=True,
        gamma=0.9,
        iteration_prints=0,
        max_episode_steps=sys.maxsize,
        epsilon=0.1,  # can be constant or EpsilonGreedy object
        alpha=0.1):  # can be constant or Alpha object
        """
        ... GIVEN AN ENVIRONMENT ... 
        Use basic Dyna-Q algorithm to solve for STATE-ACTION VALUES, Q(s,a)
        
        Each action is forced to be a DETERMINISTIC action leading to one state and reward.
        (If the next state or reward changes, only the new values will be considered)
            
        attribute: self.action_value_coll is the ActionValueColl, Q(s,a) object
        
        A DETERMINISTIC policy can be created externally from the self.action_value_coll attribute.
        """
        self.environment = environment
        self.learn_tracker = learn_tracker
        self.save_pickle_file = save_pickle_file

        self.do_summ_print = do_summ_print
        self.show_last_change = show_last_change
        self.pcent_progress_print = pcent_progress_print

        self.gamma = gamma
        self.iteration_prints = iteration_prints
        self.max_episode_steps = max_episode_steps

        self.num_episodes = 0
        self.num_updates = 0

        # if input epsilon is a float, use it to create an EpsilonGreedy object
        if type(epsilon) == type(0.1):
            self.epsilon_obj = EpsilonGreedy(epsilon=epsilon,
                                             const_epsilon=True)
        else:
            self.epsilon_obj = epsilon

        # if input alpha is a float, use it to create an Alpha object
        if type(alpha) == type(0.1):
            self.alpha_obj = Alpha(alpha=alpha, const_alpha=True)
        else:
            self.alpha_obj = alpha

        # create the action_value_coll for the environment.
        if initial_action_value_coll is None:
            self.action_value_coll = ActionValueColl(environment,
                                                     init_val=initial_Qsa)
        else:
            self.action_value_coll = initial_action_value_coll

        if read_pickle_file:
            self.action_value_coll.init_from_pickle_file(read_pickle_file)

        # initialize the model that will build from experience
        # do not build the full model description on Model init; states not visited
        #  by the RL portion would have no recorded return values.
        self.model = Model(environment, build_initial_model=False)
        #for s_hash, aD in self.action_value_coll.QsaD.items():
        #    for a_desc, Q in aD.items():
        #        self.model.add_action( s_hash, a_desc )

        if do_summ_print:
            print(
                '================== EPSILON GREEDY DEFINED AS ========================'
            )
            self.epsilon_obj.summ_print()

            print(
                '================== LEARNING RATE DEFINED AS ========================'
            )
            self.alpha_obj.summ_print()

        if show_banner:
            s = 'Starting a Maximum of %i Dyna-Q Epsilon Greedy Steps/Episode'%self.max_episode_steps +\
                '\nfor "%s" with Gamma = %g, Alpha = %g'%( environment.name, self.gamma, self.alpha_obj() )
            banner(s, banner_char='', leftMargin=0, just='center')
Example #11
class DynaQAgent(object):
    """
    DynaQ Agent.
    """
    def __init__(
        self,
        environment,
        learn_tracker=None,  # track progress of learning
        initial_Qsa=0.0,  # init non-terminal_set of Q(s,a) (terminal_set=0.0)
        initial_action_value_coll=None,  # if input, use it.
        read_pickle_file='',
        save_pickle_file='',
        do_summ_print=True,
        show_last_change=True,
        pcent_progress_print=10,
        show_banner=True,
        gamma=0.9,
        iteration_prints=0,
        max_episode_steps=sys.maxsize,
        epsilon=0.1,  # can be constant or EpsilonGreedy object
        alpha=0.1):  # can be constant or Alpha object
        """
        ... GIVEN AN ENVIRONMENT ... 
        Use basic Dyna-Q algorithm to solve for STATE-ACTION VALUES, Q(s,a)
        
        Each action is forced to be a DETERMINISTIC action leading to one state and reward.
        (If the next state or reward changes, only the new values will be considered)
            
        attribute: self.action_value_coll is the ActionValueColl, Q(s,a) object
        
        A DETERMINISTIC policy can be created externally from the self.action_value_coll attribute.
        """
        self.environment = environment
        self.learn_tracker = learn_tracker
        self.save_pickle_file = save_pickle_file

        self.do_summ_print = do_summ_print
        self.show_last_change = show_last_change
        self.pcent_progress_print = pcent_progress_print

        self.gamma = gamma
        self.iteration_prints = iteration_prints
        self.max_episode_steps = max_episode_steps

        self.num_episodes = 0
        self.num_updates = 0

        # if input epsilon is a float, use it to create an EpsilonGreedy object
        if type(epsilon) == type(0.1):
            self.epsilon_obj = EpsilonGreedy(epsilon=epsilon,
                                             const_epsilon=True)
        else:
            self.epsilon_obj = epsilon

        # if input alpha is a float, use it to create an Alpha object
        if type(alpha) == type(0.1):
            self.alpha_obj = Alpha(alpha=alpha, const_alpha=True)
        else:
            self.alpha_obj = alpha

        # create the action_value_coll for the environment.
        if initial_action_value_coll is None:
            self.action_value_coll = ActionValueColl(environment,
                                                     init_val=initial_Qsa)
        else:
            self.action_value_coll = initial_action_value_coll

        if read_pickle_file:
            self.action_value_coll.init_from_pickle_file(read_pickle_file)

        # initialize the model that will build from experience
        # do not build the full model description on Model init; states not visited
        #  by the RL portion would have no recorded return values.
        self.model = Model(environment, build_initial_model=False)
        #for s_hash, aD in self.action_value_coll.QsaD.items():
        #    for a_desc, Q in aD.items():
        #        self.model.add_action( s_hash, a_desc )

        if do_summ_print:
            print(
                '================== EPSILON GREEDY DEFINED AS ========================'
            )
            self.epsilon_obj.summ_print()

            print(
                '================== LEARNING RATE DEFINED AS ========================'
            )
            self.alpha_obj.summ_print()

        if show_banner:
            s = 'Starting a Maximum of %i Dyna-Q Epsilon Greedy Steps/Episode'%self.max_episode_steps +\
                '\nfor "%s" with Gamma = %g, Alpha = %g'%( environment.name, self.gamma, self.alpha_obj() )
            banner(s, banner_char='', leftMargin=0, just='center')

    def run_episode(self, start_state, Nplanning_loops=5, iter_sarsn=None):
        """
        Run a single episode of Dyna-Q algorithm
        If iter_sarsn is input, use it instead of action_value_coll calculations.
        (Note: the start_state should NOT be in terminal_set if iter_sarsn is input.)
        """

        # increment episode counters
        self.num_episodes += 1
        self.epsilon_obj.inc_N_episodes()
        self.alpha_obj.inc_N_episodes()

        if self.learn_tracker is not None:
            self.learn_tracker.add_new_episode()

        # do dyna_q loops until sn_hash in terminal_set
        s_hash = start_state

        n_steps_in_episode = 1
        while s_hash not in self.environment.terminal_set:

            if iter_sarsn is None:
                # get best epsilon-greedy action
                a_desc = self.action_value_coll.get_best_eps_greedy_action( \
                                                s_hash, epsgreedy_obj=self.epsilon_obj )
                # check for bad action value
                if a_desc is None:
                    print('break for a_desc==None at s_hash=%s' % str(s_hash))
                    break

                # get next state and reward
                sn_hash, reward = self.environment.get_action_snext_reward(
                    s_hash, a_desc)
            else:
                # retracing an existing episode
                s_hash, a_desc, reward, sn_hash = next(iter_sarsn)

            if self.learn_tracker is not None:
                self.learn_tracker.add_sarsn_to_current_episode(
                    s_hash, a_desc, reward, sn_hash)

            if sn_hash is None:
                print('break for sn_hash==None, #steps=', n_steps_in_episode,
                      ' s_hash=%s' % str(s_hash), ' a_desc=%s' % str(a_desc))
                break

            # do RL update of Q(s,a) value
            self.action_value_coll.qlearning_update(s_hash=s_hash,
                                                    a_desc=a_desc,
                                                    sn_hash=sn_hash,
                                                    alpha=self.alpha_obj(),
                                                    gamma=self.gamma,
                                                    reward=reward)
            self.num_updates += 1
            # give the above experience to the model
            self.model.add_action(s_hash, a_desc)

            # force DETERMINISTIC next state and reward.
            self.model.save_deterministic_action_results(s_hash,
                                                         a_desc,
                                                         sn_hash,
                                                         reward_val=reward)

            # do NOT use simple save_action_results... it allows NON-DETERMINISTIC next state.
            #self.model.save_action_results( s_hash, a_desc, sn_hash, reward_val=reward)

            # --------------------------------- Planning Loop ------------------------
            # make Nplanning_loops calls to model
            for n_plan in range(Nplanning_loops):
                s_model = self.model.get_random_state()
                #print(s_model, end=' ')

                # vanilla DynaQ
                a_model = self.model.get_random_action(s_model)

                #sn_model, r_model = self.environment.get_action_snext_reward( s_model, a_model )
                sn_model, r_model = self.model.get_sample_sn_r(
                    s_model, a_model)

                # update for the DynaQ  results.
                self.action_value_coll.qlearning_update(s_hash=s_model,
                                                        a_desc=a_model,
                                                        sn_hash=sn_model,
                                                        alpha=self.alpha_obj(),
                                                        gamma=self.gamma,
                                                        reward=r_model)
                self.num_updates += 1

            # keep a lid on the max number of episode steps.
            if n_steps_in_episode >= self.max_episode_steps:
                break

            # get ready for next loop
            n_steps_in_episode += 1
            s_hash = sn_hash

        #print(n_steps_in_episode, end=' ')

    def summ_print(self, long=True):  # pragma: no cover
        """Show State objects in sorted state_hash order."""
        print('___ Policy Evaluation Agent Summary ___')
        print('    Environment        = %s' % self.environment.name)
        print('    Number of Episodes = %g' % self.num_episodes)
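
A hedged end-to-end sketch of driving the agent above, assembled only from calls that appear in these excerpts (get_gridworld from Example #4, the DynaQAgent constructor, run_episode, and summ_print from Example #11); the start state, episode count, and keyword choices are illustrative assumptions:

# Usage sketch, not from the introrl source; assumes it runs in the same file
# as (or after importing) the DynaQAgent class defined above.
from introrl.mdp_data.simple_grid_world import get_gridworld

gridworld = get_gridworld()

# quieter construction is assumed here
agent = DynaQAgent(gridworld, do_summ_print=False, show_banner=False)

start_state = (1, 0)   # assumed non-terminal start state (a state hash used in Example #4)
for _ in range(100):   # illustrative number of episodes
    agent.run_episode(start_state, Nplanning_loops=5)

agent.summ_print()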