if __name__ == "__main__": # pragma: no cover

    import time
    import os, sys
    from introrl.agent_supt.model import Model
    from introrl.environments.env_baseline import EnvBaseline
    from introrl.dp_funcs.dp_value_iter import dp_value_iteration
    from introrl.utils import pickle_esp

    start_time = time.time()

    RW = RandomWalk_1000Simulation()
    #RW.layout.s_hash_print( none_str='*' )

    get_sim = Model( RW, build_initial_model=True )

    get_sim.collect_transition_data( num_det_calls=100, num_stoic_calls=10000 )

    RW.layout.s_hash_print()

    #get_sim.num_calls_layout_print()
    #get_sim.min_num_calls_layout_print()

    env = EnvBaseline( s_hash_rowL=RW.s_hash_rowL,
                       x_axis_label=RW.x_axis_label,
                       y_axis_label=RW.y_axis_label )

    get_sim.add_all_data_to_an_environment( env )

    policy, state_value = dp_value_iteration( env, do_summ_print=True,
                                              fmt_V='%.3f', fmt_R='%.1f' )
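    # Added (not in the original script): report elapsed wall-clock time,
    # since start_time is captured above but otherwise unused.
    print('dp_value_iteration finished in %.1f seconds' % (time.time() - start_time))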
            if sn_count == len(snD):
                print()

        print('____'+'_'*len(header))

if __name__ == "__main__": # pragma: no cover

    from introrl.agent_supt.model import Model
    from introrl.mdp_data.simple_grid_world import get_gridworld

    gridworld = get_gridworld()

    get_sim = Model( gridworld, build_initial_model=True )

    # ---------- make a few stochastic to test summ_print
    #get_sim.define_statesD[s_hash].save_action_results( a_desc, sn_hash, reward_val)

    # make just the reward stochastic
    get_sim.define_statesD[(0, 2)].save_action_results( 'R', (0,3), 2.0)

    # make the action stochastic
    get_sim.define_statesD[(1,0)].save_action_results( 'U', 'XXX', 0.0)

    # make both the action and reward stochastic
    get_sim.define_statesD[(2,2)].save_action_results( 'U', 'XXX', 2.0)
    get_sim.define_statesD[(2,2)].save_action_results( 'U', 'XXX', 2.2)
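    # Added follow-up (hedged): exercise the summ_print that the stochastic
    # entries above were created to test; Model.summ_print(long=...) is the
    # call used elsewhere in this package, and long=True is assumed to show
    # the per-action detail.
    get_sim.summ_print(long=True)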
        return ['U', 'D', 'R', 'L']

if __name__ == "__main__": # pragma: no cover

    import time
    import os, sys
    from introrl.td_funcs.qlearning_epsilon_greedy import qlearning_epsilon_greedy
    from introrl.td_funcs.sarsa_epsilon_greedy import sarsa_epsilon_greedy
    from introrl.agent_supt.model import Model

    bmaze = BlockingMaze()
    bmaze.open_gate_R()
    bmaze.close_gate_L()

    env = Model(bmaze, build_initial_model=True)
    env.collect_transition_data(num_det_calls=10, num_stoic_calls=1000)

    env.summ_print(long=False)
    bmaze.layout.s_hash_print(none_str='*')

    bmaze.open_gate_L()
    bmaze.close_gate_R()

    env.collect_transition_data(num_det_calls=10, num_stoic_calls=1000)
    env.summ_print(long=False)

    policy, action_value = \
        sarsa_epsilon_greedy( bmaze,
                              initial_Qsa=0.0, # init non-terminal_set of V(s) (terminal_set=0.0)
                              read_pickle_file='',
                              save_pickle_file='',
                              use_list_of_start_states=False ) # use list OR single start state of environment.
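    # Added follow-up: show the maze layout again after the gate switch
    # (same call as used above).
    bmaze.layout.s_hash_print(none_str='*')

    # Hedged assumption: the returned policy and action_value objects expose
    # summ_print() like the other introrl objects used in this file.
    #policy.summ_print()
    #action_value.summ_print()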
from introrl.dp_funcs.dp_policy_eval import dp_policy_evaluation
from introrl.black_box_sims.random_walk_1000 import RandomWalk_1000Simulation
from introrl.policy import Policy
from introrl.state_values import StateValues
from introrl.agent_supt.model import Model
from introrl.environments.env_baseline import EnvBaseline
from introrl.utils import pickle_esp

RW = RandomWalk_1000Simulation()

model = Model(RW, build_initial_model=True)
model.collect_transition_data(num_det_calls=100, num_stoic_calls=10000)
print('Model Built')

# build an EnvBaseline from the Simulation
env = EnvBaseline(s_hash_rowL=RW.s_hash_rowL,
                  x_axis_label=RW.x_axis_label,
                  y_axis_label=RW.y_axis_label)
model.add_all_data_to_an_environment(env)

policy = Policy(environment=env)
policy.intialize_policy_to_equiprobable(env=env)

state_value = StateValues(env)
state_value.init_Vs_to_zero()

dp_policy_evaluation(policy, state_value, do_summ_print=True,
                     max_iter=1000, err_delta=0.0001, gamma=1.0)
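# Hedged note (added): pickle_esp is imported above but unused; if the evaluated
# policy and state values should be persisted, a call along these lines could be
# added. The function name and signature below are hypothetical -- check
# pickle_esp's actual API before enabling.
#pickle_esp.save_to_pickle_file('random_walk_1000_eval', env, state_value, policy)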
    def __init__(self, environment,
                 learn_tracker=None, # track progress of learning
                 initial_Qsa=0.0, # init non-terminal_set of V(s) (terminal_set=0.0)
                 initial_action_value_coll=None, # if input, use it.
                 read_pickle_file='',
                 save_pickle_file='',
                 do_summ_print=True, show_last_change=True, pcent_progress_print=10,
                 show_banner=True,
                 gamma=0.9,
                 iteration_prints=0,
                 max_episode_steps=sys.maxsize,
                 epsilon=0.1, # can be constant or EpsilonGreedy object
                 alpha=0.1): # can be constant or Alpha object
        """
        ... GIVEN AN ENVIRONMENT ...
        Use basic Dyna-Q algorithm to solve for STATE-ACTION VALUES, Q(s,a)

        Each action is forced to be a DETERMINISTIC action leading to one state and reward.
        (If the next state or reward changes, only the new values will be considered)

        attribute: self.action_value_coll is the ActionValueColl, Q(s,a) object

        A DETERMINISTIC policy can be created externally from the self.action_value_coll attribute.
        """
        self.environment = environment
        self.learn_tracker = learn_tracker
        self.save_pickle_file = save_pickle_file

        self.do_summ_print = do_summ_print
        self.show_last_change = show_last_change
        self.pcent_progress_print = pcent_progress_print

        self.gamma = gamma
        self.iteration_prints = iteration_prints
        self.max_episode_steps = max_episode_steps

        self.num_episodes = 0
        self.num_updates = 0

        # if input epsilon is a float, use it to create an EpsilonGreedy object
        if type(epsilon) == type(0.1):
            self.epsilon_obj = EpsilonGreedy(epsilon=epsilon, const_epsilon=True)
        else:
            self.epsilon_obj = epsilon

        # if input alpha is a float, use it to create an Alpha object
        if type(alpha) == type(0.1):
            self.alpha_obj = Alpha(alpha=alpha, const_alpha=True)
        else:
            self.alpha_obj = alpha

        # create the action_value_coll for the environment.
        if initial_action_value_coll is None:
            self.action_value_coll = ActionValueColl(environment, init_val=initial_Qsa)
        else:
            self.action_value_coll = initial_action_value_coll

        if read_pickle_file:
            self.action_value_coll.init_from_pickle_file(read_pickle_file)

        # initialize the model that will build from experience
        # do not build full model description on Model init, states not visited
        # by the RL portion will have no returns values.
        self.model = Model(environment, build_initial_model=False)

        #for s_hash, aD in self.action_value_coll.QsaD.items():
        #    for a_desc, Q in aD.items():
        #        self.model.add_action( s_hash, a_desc )

        if do_summ_print:
            print( '================== EPSILON GREEDY DEFINED AS ========================' )
            self.epsilon_obj.summ_print()

            print( '================== LEARNING RATE DEFINED AS ========================' )
            self.alpha_obj.summ_print()

        if show_banner:
            s = 'Starting a Maximum of %i Dyna-Q Epsilon Greedy Steps/Episode'%self.max_episode_steps +\
                '\nfor "%s" with Gamma = %g, Alpha = %g'%( environment.name, self.gamma, self.alpha_obj() )
            banner(s, banner_char='', leftMargin=0, just='center')
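    # Hedged usage sketch (added): "DynaQAgent" below is a placeholder name for
    # whatever class this __init__ belongs to, and the learning loop is only
    # indicated, since the episode-running methods are not shown here. The
    # gridworld import is the one used elsewhere in this package.
    #
    #   from introrl.mdp_data.simple_grid_world import get_gridworld
    #   gridworld = get_gridworld()
    #   agent = DynaQAgent( gridworld, initial_Qsa=0.0, gamma=0.9,
    #                       epsilon=0.1, alpha=0.1, max_episode_steps=1000 )
    #   ...run learning episodes, then build a deterministic policy externally
    #   ...from agent.action_value_coll as described in the docstring above.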