def init_Qsa_to_zero(self):
    # initialize to 0.0 for all states, terminal and non-terminal.
    for s_hash in self.environment.iter_all_states():
        if s_hash not in self.Qsa_RaveD:
            self.Qsa_RaveD[s_hash] = {}

        # may not be any actions in terminal state, so set None action.
        if s_hash in self.environment.terminal_set:
            self.Qsa_RaveD[s_hash][None] = RunningAve(name=str(s_hash) + ' None')

        aL = self.environment.get_state_legal_action_list(s_hash)
        for a_desc in aL:
            self.Qsa_RaveD[s_hash][a_desc] = RunningAve(name=str(s_hash) + ' ' + str(a_desc))
def save_action_results(self, a_desc, sn_hash, reward_val, force_deterministic=False):
    """
    Add sn_hash to possible next states and add to its RunningAve.
    If force_deterministic is True, force the new sn_hash to be unique.
    """
    # make sure that a_desc is initialized
    if a_desc not in self.action_countD:
        self.add_action( a_desc )

    # increment action counters
    self.action_countD[ a_desc ] += 1  # inc. count of a_desc calls
    self.total_action_calls += 1

    # make sure sn_hash dict is initialized for a_desc
    if a_desc not in self.action_sn_rD:
        self.action_sn_rD[ a_desc ] = {}  # snD... index=sn_hash: value=RunningAve of Reward

    # save sn_hash and update reward running average for (a_desc, sn_hash)
    if sn_hash not in self.action_sn_rD[ a_desc ]:
        self.action_sn_rD[ a_desc ][ sn_hash ] = \
            RunningAve( name='Reward (%s, %s, %s)' % (str(self.s_hash), str(a_desc), str(sn_hash)) )

    # update the RunningAve of (a_desc, sn_hash) with current reward_val
    self.action_sn_rD[ a_desc ][sn_hash].add_val( reward_val )

    if force_deterministic and (len(self.action_sn_rD[ a_desc ]) > 1):
        # remove any sn_hash other than the current input sn_hash
        D = {sn_hash: self.action_sn_rD[ a_desc ][sn_hash]}
        self.action_sn_rD[ a_desc ] = D
        self.action_sn_rD[ a_desc ][sn_hash].set_all_attrib( 1, reward_val, reward_val, reward_val )
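# Every snippet in this section leans on introrl.utils.running_ave.RunningAve.
# A minimal stand-in with the interface assumed by the code above (add_val,
# get_ave, num_val, set_all_attrib) might look like the sketch below; it is
# illustrative only, not the library's actual implementation.
class SimpleRunningAve:
    """Hypothetical stand-in for RunningAve: incremental average of a value stream."""

    def __init__(self, name=''):
        self.name = name
        self.num_val = 0              # number of values added so far
        self.sum_val = 0.0            # running sum of added values
        self.min_val = float('inf')   # smallest value seen
        self.max_val = float('-inf')  # largest value seen

    def add_val(self, val):
        # fold one new value into the running statistics
        self.num_val += 1
        self.sum_val += val
        self.min_val = min(self.min_val, val)
        self.max_val = max(self.max_val, val)

    def get_ave(self):
        # current average (0.0 if nothing has been added yet)
        return self.sum_val / self.num_val if self.num_val else 0.0

    def set_all_attrib(self, num_val, ave, min_val, max_val):
        # overwrite the statistics outright, as save_action_results does above
        # when forcing a deterministic (single next-state) transition
        self.num_val = num_val
        self.sum_val = ave * num_val
        self.min_val = min_val
        self.max_val = max_val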
def get_td0_data():

    if 'TD0_raveL' in dataD:
        TD0_raveL = dataD['TD0_raveL']
        Nruns = TD0_raveL[0].num_val
        print(Nruns, ' of TD0_raveL found')
    else:
        TD0_raveL = []
        Nruns = 0

    for loop in range(Nruns, RUN_COUNT):
        learn_tracker.clear()

        policy, state_value = \
            td0_epsilon_greedy( CW, learn_tracker=learn_tracker,
                                initial_Vs=0.0,  # init non-terminal_set of V(s) (terminal_set=0.0)
                                use_list_of_start_states=False,  # use list OR single start state of environment.
                                do_summ_print=False, show_last_change=False, fmt_V='%g', fmt_R='%g',
                                pcent_progress_print=0,
                                show_banner=False,
                                max_num_episodes=500, min_num_episodes=10, max_abserr=0.001,
                                gamma=1.0,
                                max_episode_steps=1000,
                                epsilon=EPSILON,
                                alpha=ALPHA )

        reward_sum_per_episodeL = learn_tracker.reward_sum_per_episode()

        while len(reward_sum_per_episodeL) > len(TD0_raveL):
            TD0_raveL.append( RunningAve() )

        for R, r in zip(TD0_raveL, reward_sum_per_episodeL):
            R.add_val( r )

    dataD['TD0_raveL'] = TD0_raveL
    save_to_pickle()
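# Illustrative follow-on (assumed, not shown in the source): the per-episode
# RunningAve list built by get_td0_data() reduces to one averaged learning
# curve of episode reward vs. episode number.
td0_curve = [R.get_ave() for R in dataD['TD0_raveL']]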
def get_estimated_rewards(self):
    """
    Return a dictionary of estimated rewards for each state
    AND a dictionary of any special messages.
    (Will be exact for a deterministic environment.)
    """
    est_rD = {}  # index=s_hash, value=float reward estimate
    msgD = {}    # index=s_hash, value=any special message

    # initialize a RunningAve (starting empty) for every state.
    for S in self.SC.iter_states():
        est_rD[S.hash] = RunningAve(S.hash)

    for s_hash, a_desc, T in self.TC.iter_all_transitions():
        for sn_hash, t_prob, reward in T.iter_sn_hash_prob_reward():
            Robj = T.get_reward_obj(sn_hash)
            if Robj.reward_type == CONST:
                est_rD[sn_hash].add_val(reward)
            else:
                msgD[sn_hash] = 'est'
                # if the reward is stochastic, average 100 sampled values
                for i in range(100):
                    est_rD[sn_hash].add_val(Robj())

    # need to convert RunningAve objects to float
    for (s_hash, RA) in est_rD.items():
        est_rD[s_hash] = RA.get_ave()
        #print(s_hash, RA)

    return est_rD, msgD
def get_expected_sarsa_data():

    if 'ExpSarsa_raveD' in dataD:
        ExpSarsa_raveD = dataD['ExpSarsa_raveD']
        ave_run_time = dataD['ExpSarsa_ave_run_time']
    else:
        ExpSarsa_raveD = {}
        ave_run_time = RunningAve()
        for alpha in ALPHA_LIST:
            ExpSarsa_raveD[alpha] = [RunningAve(), RunningAve()]

    Nruns = ExpSarsa_raveD[0.1][0].num_val
    print(Nruns, ' of ExpSarsa_raveD found')

    for loop in range(Nruns, RUN_COUNT):
        for alpha in ALPHA_LIST:
            start_time = time.time()
            learn_tracker.clear()

            policy, state_value = \
                expected_sarsa_eps_greedy( CW, learn_tracker=learn_tracker,
                                           initial_Qsa=0.0,  # init non-terminal Q(s,a) (terminal_set=0.0)
                                           use_list_of_start_states=False,  # use list OR single start state of environment.
                                           do_summ_print=False, show_last_change=False, fmt_Q='%g', fmt_R='%g',
                                           pcent_progress_print=0,
                                           show_banner=False,
                                           max_num_episodes=1000, min_num_episodes=1000, max_abserr=0.000001,
                                           gamma=1.0,
                                           max_episode_steps=10000,
                                           epsilon=EPSILON,
                                           alpha=alpha )

            reward_sum_per_episodeL = learn_tracker.reward_sum_per_episode()
            ave_run_time.add_val(time.time() - start_time)  # accumulate average run time

            ExpSarsa_raveD[alpha][0].add_val( sum(reward_sum_per_episodeL[:100]) / 100.0 )
            ExpSarsa_raveD[alpha][1].add_val( sum(reward_sum_per_episodeL) / 1000.0 )
        print('.', end='')

    print('ExpSarsa_ave_run_time = ', ave_run_time.get_ave())
    dataD['ExpSarsa_raveD'] = ExpSarsa_raveD
    dataD['ExpSarsa_ave_run_time'] = ave_run_time
    save_to_pickle('ExpSarsa_raveD', 'ExpSarsa_ave_run_time')
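# Illustrative follow-on (assumed, not shown in the source): collapse the
# per-alpha RunningAve pairs collected above into two performance curves,
# average reward over the first 100 episodes vs. over all 1000 episodes.
ExpSarsa_raveD = dataD['ExpSarsa_raveD']
first_100_curve = [ExpSarsa_raveD[alpha][0].get_ave() for alpha in ALPHA_LIST]
all_1000_curve  = [ExpSarsa_raveD[alpha][1].get_ave() for alpha in ALPHA_LIST]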
#print('true_valueD =', true_valueD)
#sys.exit()

# ----------------------------------------- generate data -------------
alphaL = [0.05*n for n in range(21)]
nstepL = [1, 2, 4, 8, 16, 32]

nstep_walkerL = []
ave_rms_aveD = {}  # index=(alpha, Nsteps), value=RunningAve
sv_collD = {}      # index=(alpha, Nsteps), value=StateValueColl

# create data structures
for Nsteps in nstepL:
    nstep_walkerL.append( NStepTDWalker(rw_mrp, Nsteps=Nsteps, episode_obj=episode_obj) )
    for alpha in alphaL:
        ave_rms_aveD[ (alpha, Nsteps) ] = RunningAve()
        sv_collD[ (alpha, Nsteps) ] = StateValueColl( rw_mrp, init_val=0.0 )

# begin main loop over runs
for loop in range(AVE_OVER):  # average rms curves over AVE_OVER runs
    if loop % 10 == 0:
        print(loop, end='')
    else:
        print('.', end='')

    # set state variables to 0.0
    for Nsteps in nstepL:
        for alpha in alphaL:
            sv_collD[ (alpha, Nsteps) ].init_Vs_to_val( 0.0 )

    # get the initial RMS
expected_sarsa_eps_greedy( CW, learn_tracker=learn_tracker,
                           initial_Qsa=0.0,  # init non-terminal Q(s,a) (terminal_set=0.0)
                           use_list_of_start_states=False,  # use list OR single start state of environment.
                           do_summ_print=False, show_last_change=False, fmt_Q='%g', fmt_R='%g',
                           show_banner=False,
                           pcent_progress_print=0,
                           max_num_episodes=500, min_num_episodes=10, max_abserr=0.001,
                           gamma=1.0,
                           max_episode_steps=1000,
                           epsilon=EPSILON,
                           alpha=ALPHA )

reward_sum_per_episodeL_es = learn_tracker.reward_sum_per_episode()

while len(reward_sum_per_episodeL_es) > len(ExpSarsa_raveL):
    ExpSarsa_raveL.append( RunningAve() )

for R, r in zip(ExpSarsa_raveL, reward_sum_per_episodeL_es):
    R.add_val( r )

learn_tracker.clear()
policy_t, state_value_t = \
    td0_epsilon_greedy( CW, learn_tracker=learn_tracker,
                        initial_Vs=0.0,  # init non-terminal_set of V(s) (terminal_set=0.0)
                        use_list_of_start_states=False,  # use list OR single start state of environment.
                        do_summ_print=False, show_last_change=False, fmt_V='%g', fmt_R='%g',
                        show_banner=False,
                        pcent_progress_print=0,
                        max_num_episodes=500, min_num_episodes=10, max_abserr=0.001,
                        gamma=1.0,
                        max_episode_steps=1000,
                        epsilon=EPSILON,
import matplotlib
import matplotlib.pyplot as plt

from introrl.mc_funcs.mc_ev_prediction import mc_every_visit_prediction
from introrl.policy import Policy
from introrl.agent_supt.state_value_coll import StateValueColl
from introrl.mdp_data.random_walk_mrp import get_random_walk
from introrl.agent_supt.episode_maker import make_episode
from introrl.utils.running_ave import RunningAve

rw_mrp = get_random_walk()
policy = Policy( environment=rw_mrp )

NumEpisodes = 100
mc_rms_raveL = [RunningAve(name='%i'%i) for i in range(NumEpisodes)]
td_rms_raveL = [RunningAve(name='%i'%i) for i in range(NumEpisodes)]

alpha_td = 0.1
alpha_mc = 0.02
gamma = 1.0

true_valueD = {'A':1.0/6.0, 'B':2.0/6.0, 'C':3.0/6.0, 'D':4.0/6.0, 'E':5.0/6.0}

for o_loop in range(1, 101):
    print('%2i'%o_loop, end=' ')
    if o_loop % 20 == 0:
        print()

    # make 2 state value objects.
    sv_td = StateValueColl( rw_mrp, init_val=0.5 )
    sv_mc = StateValueColl( rw_mrp, init_val=0.5 )
def init_Vs_to_zero(self):
    # initialize to 0.0 for all states, terminal and non-terminal.
    for s_hash in self.environment.iter_all_states():
        self.Vs_RaveD[s_hash] = RunningAve(name=s_hash)
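# Hypothetical companion (not part of the source): collapse the RunningAve
# objects built by init_Vs_to_zero into plain float V(s) estimates, mirroring
# the RunningAve-to-float conversion done in get_estimated_rewards above.
def get_ave_Vs_dict(Vs_RaveD):
    return {s_hash: R.get_ave() for (s_hash, R) in Vs_RaveD.items()}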
sarsa_epsilon_greedy( CW, learn_tracker=learn_tracker,
                      initial_Qsa=0.0,  # init non-terminal Q(s,a) (terminal_set=0.0)
                      use_list_of_start_states=False,  # use list OR single start state of environment.
                      do_summ_print=False, show_last_change=False, fmt_Q='%g', fmt_R='%g',
                      pcent_progress_print=0,
                      show_banner=False,
                      max_num_episodes=500, min_num_episodes=10, max_abserr=0.001,
                      gamma=1.0,
                      max_episode_steps=1000,
                      epsilon=EPSILON,
                      alpha=ALPHA )

reward_sum_per_episodeL_s = learn_tracker.reward_sum_per_episode()

while len(reward_sum_per_episodeL_s) > len(Sarsa_raveL):
    Sarsa_raveL.append( RunningAve() )

for R, r in zip(Sarsa_raveL, reward_sum_per_episodeL_s):
    R.add_val( r )

learn_tracker.clear()
policy_q, state_value_q = \
    qlearning_epsilon_greedy( CW, learn_tracker=learn_tracker,
                              initial_Qsa=0.0,  # init non-terminal Q(s,a) (terminal_set=0.0)
                              use_list_of_start_states=False,  # use list OR single start state of environment.
                              do_summ_print=False, show_last_change=False, fmt_Q='%g', fmt_R='%g',
                              pcent_progress_print=0,
                              show_banner=False,
                              max_num_episodes=500, min_num_episodes=10, max_abserr=0.001,
                              gamma=1.0,
                              max_episode_steps=1000,
maze_q.open_gate_R()
maze_q.close_gate_L()

# DynaQ+ episodes
while agent_qp.model.total_action_calls < 3000:
    if agent_qp.model.total_action_calls >= 1000:
        maze_q.open_gate_L()
        maze_q.close_gate_R()
    agent_qp.run_episode('Start', Nplanning_loops=PLAN_LOOPS)

cum_rew_qL = learn_tracker_q.cum_reward_per_step()
cum_rew_qpL = learn_tracker_qp.cum_reward_per_step()

while len(q_raveL) < min(3000, len(cum_rew_qL)):
    q_raveL.append( RunningAve() )
for i, r in enumerate(cum_rew_qL):
    if i < 3000:
        q_raveL[i].add_val(r)

while len(qp_raveL) < min(3000, len(cum_rew_qpL)):
    qp_raveL.append( RunningAve() )
for i, r in enumerate(cum_rew_qpL):
    if i < 3000:
        qp_raveL[i].add_val(r)

#agent_q.model.summ_print(long=True)
#sys.exit()

agent_q.action_value_coll.summ_print(fmt_Q='%.3f', none_str='*', show_states=True,
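# Illustrative plotting step (assumed, not shown in the source): compare the
# averaged cumulative-reward curves accumulated above for Dyna-Q (q_raveL)
# and Dyna-Q+ (qp_raveL).
import matplotlib.pyplot as plt

q_curve  = [R.get_ave() for R in q_raveL]
qp_curve = [R.get_ave() for R in qp_raveL]
plt.plot(q_curve,  label='Dyna-Q')
plt.plot(qp_curve, label='Dyna-Q+')
plt.xlabel('Time Steps')
plt.ylabel('Average Cumulative Reward')
plt.legend()
plt.show()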
import sys
import matplotlib
import matplotlib.pyplot as plt

from introrl.mc_funcs.mc_ev_prediction import mc_every_visit_prediction
from introrl.policy import Policy
from introrl.agent_supt.state_value_coll import StateValueColl
from introrl.mdp_data.random_walk_mrp import get_random_walk
from introrl.agent_supt.episode_maker import make_episode
from introrl.utils.running_ave import RunningAve

rw_mrp = get_random_walk()
policy = Policy( environment=rw_mrp )

NumEpisodes = 100
mc_rms_raveL = [RunningAve(name='%i'%i) for i in range(NumEpisodes)]
td_rms_raveL = [RunningAve(name='%i'%i) for i in range(NumEpisodes)]

alpha = 0.1
gamma = 1.0

true_valueD = {'A':1.0/6.0, 'B':2.0/6.0, 'C':3.0/6.0, 'D':4.0/6.0, 'E':5.0/6.0}

def calc_td_error(show_values=True):
    errD = {}  # index=s_hash, value=Vtarget - V(s)
    for s_hash in ['A','B','C','D','E']:
        errD[s_hash] = 0.0

    for (s_hash, sn_hash), R in td_averD.items():
        errD[s_hash] += R.get_ave() + gamma*sv_td.get_Vs(sn_hash) - sv_td.get_Vs(s_hash)
fig, ax = plt.subplots()

# ---------------- set up true value data for RMS calc --------------------
true_valueD = {'C': 0.0, 'Win': 0.0, 'Lose': 0.0}
delta = 2.0 / (rw_mrp.get_num_states() + 1)
Nsides = int(rw_mrp.get_num_states() / 2) - 1

d = 0.0
for i in range(1, Nsides + 1):
    d += delta
    true_valueD['L-%i' % i] = -d
    true_valueD['R+%i' % i] = d

# ----------------------------------------- generate TD(0) data -------------
alphaL = [0.01] + [0.05 * n for n in range(1, 21)]
ave_rms_aveL = [RunningAve(name='alpha=%g' % alpha) for alpha in alphaL]

for ialpha, alpha in enumerate(alphaL):
    for loop in range(100):  # average rms curves over 100 runs
        sv = StateValueColl(rw_mrp, init_val=0.5)

        resultL, value_snapD = td0_prediction( policy, sv, all_start_states=False,
                                               do_summ_print=False, show_last_change=False,
                                               show_banner=False,
                                               pcent_progress_print=0,
                                               alpha=alpha,
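# Hypothetical helper (not in the source): RMS error of a StateValueColl's
# current estimates against the true_valueD built above, assuming
# sv.get_Vs(s_hash) returns the estimate for state s_hash (as it is used
# elsewhere in this section).
import math

def rms_error(sv, true_valueD):
    errs = [(sv.get_Vs(s_hash) - v)**2 for (s_hash, v) in true_valueD.items()]
    return math.sqrt(sum(errs) / len(errs))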