Example #1
if __name__ == "__main__": # pragma: no cover
    
    import time
    import os, sys
    from introrl.agent_supt.model import Model
    from introrl.environments.env_baseline import EnvBaseline
    from introrl.dp_funcs.dp_value_iter import dp_value_iteration
    from introrl.utils import pickle_esp
    
    start_time = time.time()
    
    RW = RandomWalk_1000Simulation()
    #RW.layout.s_hash_print( none_str='*' )
    
    
    get_sim = Model( RW, build_initial_model=True )

    get_sim.collect_transition_data( num_det_calls=100, num_stoic_calls=10000 )

    RW.layout.s_hash_print()

    #get_sim.num_calls_layout_print()
    #get_sim.min_num_calls_layout_print()
    
    env = EnvBaseline( s_hash_rowL=RW.s_hash_rowL, 
                       x_axis_label=RW.x_axis_label, 
                       y_axis_label=RW.y_axis_label )
                       
    get_sim.add_all_data_to_an_environment( env )

    policy, state_value = dp_value_iteration( env, do_summ_print=True,
                                               fmt_V='%.3f', fmt_R='%.1f' )
    # (call truncated in the source listing; remaining keyword arguments omitted)
Example #2
                    
                    if sn_count == len(snD):
                        print()
                    
                    
        print('____'+'_'*len(header))
                

if __name__ == "__main__": # pragma: no cover
    
    from introrl.agent_supt.model import Model
    from introrl.mdp_data.simple_grid_world import get_gridworld
    
    gridworld = get_gridworld()
    
    get_sim = Model( gridworld, build_initial_model=True )
    
    # ---------- make a few transitions stochastic to test summ_print
    #get_sim.define_statesD[s_hash].save_action_results( a_desc, sn_hash, reward_val)
    
    # make just the reward stochastic
    get_sim.define_statesD[(0, 2)].save_action_results( 'R', (0,3), 2.0)
    
    # make the action stochastic
    get_sim.define_statesD[(1,0)].save_action_results( 'U', 'XXX', 0.0)
    
    # make both the action and reward stochastic
    get_sim.define_statesD[(2,2)].save_action_results( 'U', 'XXX', 2.0)
    get_sim.define_statesD[(2,2)].save_action_results( 'U', 'XXX', 2.2)
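
The comment above notes that these stochastic entries are added "to test summ_print", but the listing is cut off before that call is shown. A minimal follow-up sketch, using the Model.summ_print(long=...) signature that appears in Example #3:

    # Sketch: summarize the model so the stochastic entries added above
    # for states (0, 2), (1, 0) and (2, 2) can be inspected.
    get_sim.summ_print( long=False )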

    
Example #3
        return ['U', 'D', 'R', 'L']


if __name__ == "__main__":  # pragma: no cover

    import time
    import os, sys
    from introrl.td_funcs.qlearning_epsilon_greedy import qlearning_epsilon_greedy
    from introrl.td_funcs.sarsa_epsilon_greedy import sarsa_epsilon_greedy
    from introrl.agent_supt.model import Model

    bmaze = BlockingMaze()
    bmaze.open_gate_R()
    bmaze.close_gate_L()

    env = Model(bmaze, build_initial_model=True)
    env.collect_transition_data(num_det_calls=10, num_stoic_calls=1000)
    env.summ_print(long=False)

    bmaze.layout.s_hash_print(none_str='*')
    bmaze.open_gate_L()
    bmaze.close_gate_R()
    env.collect_transition_data(num_det_calls=10, num_stoic_calls=1000)
    env.summ_print(long=False)

    policy, action_value = \
        sarsa_epsilon_greedy( bmaze,
                              initial_Qsa=0.0, # initial Q(s,a) for non-terminal states (terminal states stay 0.0)
                              read_pickle_file='',
                              save_pickle_file='',
                              use_list_of_start_states=False ) # use list OR single start state of environment.
    # (call truncated in the source listing; remaining keyword arguments omitted)
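
The listing is cut off before this example's remaining solver calls, so the qlearning_epsilon_greedy import at the top of the block goes unused here. A hedged sketch of how that call might look, assuming (not confirmed by this truncated listing) that it accepts the same keyword arguments shown for sarsa_epsilon_greedy above:

    # Hedged sketch only: the keyword arguments are assumed to mirror the
    # sarsa_epsilon_greedy call above; consult the introrl documentation.
    policy_q, action_value_q = \
        qlearning_epsilon_greedy( bmaze,
                                  initial_Qsa=0.0,
                                  read_pickle_file='',
                                  save_pickle_file='',
                                  use_list_of_start_states=False )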
Example #4
from introrl.dp_funcs.dp_policy_eval import dp_policy_evaluation
from introrl.black_box_sims.random_walk_1000 import RandomWalk_1000Simulation
from introrl.policy import Policy
from introrl.state_values import StateValues
from introrl.agent_supt.model import Model
from introrl.environments.env_baseline import EnvBaseline
from introrl.utils import pickle_esp

RW = RandomWalk_1000Simulation()

model = Model(RW, build_initial_model=True)
model.collect_transition_data(num_det_calls=100, num_stoic_calls=10000)
print('Model Built')
# build an EnvBaseline from the Simulation
env = EnvBaseline(s_hash_rowL=RW.s_hash_rowL,
                  x_axis_label=RW.x_axis_label,
                  y_axis_label=RW.y_axis_label)
model.add_all_data_to_an_environment(env)

policy = Policy(environment=env)
policy.intialize_policy_to_equiprobable(env=env)

state_value = StateValues(env)
state_value.init_Vs_to_zero()

dp_policy_evaluation(policy,
                     state_value,
                     do_summ_print=True,
                     max_iter=1000,
                     err_delta=0.0001,
                     gamma=1.0)
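
For comparison, the same EnvBaseline assembled above can also be handed to dp_value_iteration, as Example #1 does. A minimal sketch using only the keyword arguments visible in that (truncated) call, assuming its remaining arguments have workable defaults:

from introrl.dp_funcs.dp_value_iter import dp_value_iteration

# Sketch: reuse the env built above; keyword arguments mirror Example #1.
vi_policy, vi_state_value = dp_value_iteration( env, do_summ_print=True,
                                                fmt_V='%.3f', fmt_R='%.1f' )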
Example #5
    def __init__(
        self,
        environment,
        learn_tracker=None,  # track progress of learning
        initial_Qsa=0.0,  # initial Q(s,a) for non-terminal states (terminal states stay 0.0)
        initial_action_value_coll=None,  # if input, use it.
        read_pickle_file='',
        save_pickle_file='',
        do_summ_print=True,
        show_last_change=True,
        pcent_progress_print=10,
        show_banner=True,
        gamma=0.9,
        iteration_prints=0,
        max_episode_steps=sys.maxsize,
        epsilon=0.1,  # can be constant or EpsilonGreedy object
        alpha=0.1):  # can be constant or Alpha object
        """
        ... GIVEN AN ENVIRONMENT ... 
        Use basic Dyna-Q algorithm to solve for STATE-ACTION VALUES, Q(s,a)
        
        Each action is forced to be a DETERMINISTIC action leading to one state and reward.
        (If the next state or reward changes, only the new values will be considered)
            
        attribute: self.action_value_coll is the ActionValueColl, Q(s,a) object
        
        A DETERMINISTIC policy can be created externally from the self.action_value_coll attribute.
        """
        self.environment = environment
        self.learn_tracker = learn_tracker
        self.save_pickle_file = save_pickle_file

        self.do_summ_print = do_summ_print
        self.show_last_change = show_last_change
        self.pcent_progress_print = pcent_progress_print

        self.gamma = gamma
        self.iteration_prints = iteration_prints
        self.max_episode_steps = max_episode_steps

        self.num_episodes = 0
        self.num_updates = 0

        # if input epsilon is a float, use it to create an EpsilonGreedy object
        if type(epsilon) == type(0.1):
            self.epsilon_obj = EpsilonGreedy(epsilon=epsilon,
                                             const_epsilon=True)
        else:
            self.epsilon_obj = epsilon

        # if input alpha is a float, use it to create an Alpha object
        if type(alpha) == type(0.1):
            self.alpha_obj = Alpha(alpha=alpha, const_alpha=True)
        else:
            self.alpha_obj = alpha

        # create the action_value_coll for the environment.
        if initial_action_value_coll is None:
            self.action_value_coll = ActionValueColl(environment,
                                                     init_val=initial_Qsa)
        else:
            self.action_value_coll = initial_action_value_coll

        if read_pickle_file:
            self.action_value_coll.init_from_pickle_file(read_pickle_file)

        # initialize the model that will build from experience
        # Do not build the full model description on Model init; states not
        #  visited by the RL portion would have no recorded return values.
        self.model = Model(environment, build_initial_model=False)
        #for s_hash, aD in self.action_value_coll.QsaD.items():
        #    for a_desc, Q in aD.items():
        #        self.model.add_action( s_hash, a_desc )

        if do_summ_print:
            print(
                '================== EPSILON GREEDY DEFINED AS ========================'
            )
            self.epsilon_obj.summ_print()

            print(
                '================== LEARNING RATE DEFINED AS ========================'
            )
            self.alpha_obj.summ_print()

        if show_banner:
            s = 'Starting a Maximum of %i Dyna-Q Epsilon Greedy Steps/Episode'%self.max_episode_steps +\
                '\nfor "%s" with Gamma = %g, Alpha = %g'%( environment.name, self.gamma, self.alpha_obj() )
            banner(s, banner_char='', leftMargin=0, just='center')
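
The docstring above notes that a deterministic policy can be built externally from self.action_value_coll. A minimal sketch, assuming QsaD is the {s_hash: {a_desc: Q}} mapping implied by the commented-out loop in __init__:

# Hedged sketch: extract a greedy (deterministic) policy from an
# ActionValueColl by taking the argmax action in each state.
def greedy_policy_from_action_values(action_value_coll):
    policyD = {}
    for s_hash, aD in action_value_coll.QsaD.items():
        if aD:  # skip states with no recorded actions
            policyD[s_hash] = max(aD, key=aD.get)  # action with the highest Q(s,a)
    return policyD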