import matplotlib.pyplot as plt

from introrl.policy import Policy
from introrl.agent_supt.state_value_coll import StateValueColl
from introrl.agent_supt.nstep_td_eval_walker import NStepTDWalker
from introrl.mdp_data.random_walk_generic_mrp import get_random_walk
from introrl.agent_supt.episode_maker import make_episode

GAMMA = 1.0
AVE_OVER = 100

rw_mrp = get_random_walk(Nside_states=9, win_reward=1.0, lose_reward=-1.0, step_reward=0.0)

policy = Policy( environment=rw_mrp )
policy.intialize_policy_to_equiprobable()  # should already be equiprobable from the init above

episode_obj = make_episode( 'C', policy, rw_mrp )

fig, ax = plt.subplots()

# ---------------- set up true value data for RMS calc --------------------
true_valueD = {'C': 0.0}  # terminal states: {'Win':0.0, 'Lose':0.0}
#print('rw_mrp.get_num_states() = ', rw_mrp.get_num_states())
delta = 2.0 / (rw_mrp.get_num_states() - 1)
Nsides = int( rw_mrp.get_num_states() / 2 ) - 1
d = 0.0
for i in range(1, Nsides + 1):
    d += delta
    true_valueD[ 'L-%i' % i ] = float('%g' % -d)  # round off tiny floating-point residue
    true_valueD[ 'R+%i' % i ] = float('%g' %  d)
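The listings below repeatedly report calc_rms_error(true_valueD) on a state-value collection. As a point of reference, here is a minimal standalone sketch of that RMS calculation; est_valueD is a hypothetical plain dict of estimated V(s), and the library's StateValueColl.calc_rms_error() is assumed to compute something equivalent.

# Minimal sketch of the RMS error tracked against true_valueD (illustrative only;
# 'est_valueD' is a hypothetical dict of estimated V(s), not a library object).
import math

def rms_vs_true(est_valueD, true_valueD):
    """Root-mean-square error over the states present in true_valueD."""
    sq_sum = 0.0
    for s_hash, v_true in true_valueD.items():
        v_est = est_valueD.get(s_hash, 0.0)
        sq_sum += (v_est - v_true) ** 2
    return math.sqrt(sq_sum / len(true_valueD))

# e.g. the RMS error of an all-0.5 initial estimate for the 9-state walk:
est_valueD = {s: 0.5 for s in true_valueD}
print('initial RMS =', rms_vs_true(est_valueD, true_valueD))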
                     epsilon=0.1, const_epsilon=True,
                     alpha=0.5, const_alpha=True)

print('_' * 55)
score = gridworld.get_policy_score(policy, start_state_hash=None, step_limit=1000)
print('Policy Score =', score, ' = (r_sum, n_steps, msg)')

steps_per_episodeL = learn_tracker.steps_per_episode()

print(gridworld.get_info())

episode = make_episode(gridworld.start_state_hash, policy, gridworld,
                       gridworld.terminal_set, max_steps=20)
epi_summ_print(episode, policy, gridworld, show_rewards=False,
               show_env_states=True, none_str='*')

fig, ax = plt.subplots()
plt.title('SARSA Windy Gridworld')
if 1:
    plt.xlabel('Time Steps')
    plt.ylabel('Episodes')
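The figure being set up here is the classic windy-gridworld learning curve (cumulative time steps on x, completed episodes on y). A minimal sketch of how that curve could be drawn follows, assuming steps_per_episodeL holds the per-episode step counts collected above; the file name is illustrative and this is not the library's own plotting code.

# Minimal plotting sketch (assumes steps_per_episodeL is a list of per-episode step counts).
cum_steps = 0
xL, yL = [0], [0]
for n_epi, n_steps in enumerate(steps_per_episodeL, start=1):
    cum_steps += n_steps
    xL.append(cum_steps)   # total time steps taken so far
    yL.append(n_epi)       # episodes completed by that point

ax.plot(xL, yL)
fig.savefig('sarsa_windy_gridworld.png')  # file name is illustrative
plt.show()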
from introrl.agent_supt.action_value_coll import ActionValueColl

gridworld = get_gridworld()
sv = ActionValueColl(gridworld)

pi = Policy(environment=gridworld)
pi.set_policy_from_piD(gridworld.get_default_policy_desc_dict())
#pi.summ_print()

eg = EpsilonGreedy(epsilon=0.5, const_epsilon=True, half_life=200, N_episodes_wo_decay=0)

episode_obj = make_episode((2, 0), pi, gridworld, eps_greedy=None)

# signature for reference:
#   environment, Nsteps=3, policy=None, episode_obj=None,
#   terminal_set=None, max_steps=sys.maxsize, eps_greedy=None

print('Using an episode_obj')
episode_obj.summ_print()
print(' ...')

NSW = NStepTDWalker(gridworld, Nsteps=16, episode_obj=episode_obj)
NSW.do_sarsa_action_value_updates(sv, alpha=0.1, gamma=0.9, start_state_hash=None)
#print()
#gridworld.summ_print()
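The n-step SARSA updates performed by the walker are built from the n-step return. The generic form of that return is sketched below (an illustration of the textbook formula, not the library's internal code); the helper name nstep_sarsa_return is hypothetical.

# Sketch of the n-step SARSA return behind such updates:
#   G(t:t+n) = R(t+1) + g*R(t+2) + ... + g^(n-1)*R(t+n) + g^n * Q(S(t+n), A(t+n))
def nstep_sarsa_return(rewardL, q_tail, gamma):
    """rewardL = [R(t+1), ..., R(t+n)]; q_tail = Q(S(t+n), A(t+n)), 0.0 at terminal."""
    G = q_tail
    for r in reversed(rewardL):
        G = r + gamma * G
    return G

# Q(s,a) is then nudged toward G:  Q(s,a) += alpha * (G - Q(s,a))
print(nstep_sarsa_return([0.0, 0.0, -1.0], q_tail=0.0, gamma=0.9))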
from introrl.black_box_sims.random_walk_1000 import RandomWalk_1000Simulation
from introrl.agent_supt.episode_maker import make_episode
from introrl.policy import Policy

NUM_EPISODES = 100000

countD = {}  # index=state, value=count

RW = RandomWalk_1000Simulation()
policy = Policy(environment=RW)
policy.intialize_policy_to_equiprobable( env=RW )

for Nepi in range(NUM_EPISODES):
    episode = make_episode(500, policy, RW, max_steps=10000)
    for dr in episode.get_rev_discounted_returns( gamma=1.0 ):
        (s_hash, a_desc, reward, sn_hash, G) = dr
        countD[ s_hash ] = countD.get( s_hash, 0 ) + 1

SUM_VISITS = sum( list(countD.values()) )
freqL = []
for i in range(1, 1001):
    freqL.append( countD.get(i, 0) / float(SUM_VISITS) )

# copy and paste list into plot script
print('freqL =', repr(freqL))
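The final print is meant to be pasted into a separate plot script. A minimal sketch of such a plot script follows (matplotlib assumed; not part of the introrl listing above).

# Minimal sketch of the separate plot script the comment refers to.
import matplotlib.pyplot as plt

plt.bar(range(1, 1001), freqL, width=1.0)
plt.xlabel('State')
plt.ylabel('Fraction of visits')
plt.title('1000-State Random Walk: state-visit distribution')
plt.show()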
sarsa_epsilon_greedy( sim,
                      initial_Qsa=0.0,  # init non-terminal_set of V(s) (terminal_set=0.0)
                      read_pickle_file='',
                      save_pickle_file='',
                      use_list_of_start_states=False,  # use list OR single start state of environment.
                      do_summ_print=True, show_last_change=True, fmt_Q='%g', fmt_R='%g',
                      max_num_episodes=500, min_num_episodes=10, max_abserr=0.001, gamma=1.0,
                      iteration_prints=0,
                      max_episode_steps=1000,
                      epsilon=0.1, const_epsilon=True, epsilon_half_life=200,
                      alpha=0.1, const_alpha=True, alpha_half_life=200,
                      N_episodes_wo_decay=0)

episode = make_episode(sim.start_state_hash, policy, sim, sim.terminal_set, max_steps=20)
epi_summ_print(episode, policy, sim, show_rewards=False, show_env_states=True, none_str='*')

sim.random_transition_prob = 0.0  # so arrows are drawn deterministically on policy diagram
policy.save_diagram(sim, inp_colorD=None, save_name='sample_sim_policy',
                    show_arrows=True, scale=1.0,
from introrl.agent_supt.episode_maker import make_episode
from introrl.agent_supt.episode_summ_print import epi_summ_print

MB = MaximizationBiasMDP()
MB.layout.s_hash_print(none_str='*')

policy, state_value = \
    qlearning_epsilon_greedy( MB,
                              initial_Qsa=0.0,  # init non-terminal_set of V(s) (terminal_set=0.0)
                              use_list_of_start_states=False,  # use list OR single start state of environment.
                              do_summ_print=True, show_last_change=True, fmt_Q='%g', fmt_R='%g',
                              pcent_progress_print=0,
                              show_banner=True,
                              max_num_episodes=10, min_num_episodes=10, max_abserr=0.001, gamma=1.0,
                              max_episode_steps=100,
                              epsilon=0.1,
                              alpha=0.1)

episode = make_episode(MB.start_state_hash, policy, MB, MB.terminal_set, max_steps=20)
epi_summ_print(episode, policy, MB, show_rewards=False, show_env_states=True, none_str='*')
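Both the Q-learning and SARSA routines in these listings select actions epsilon-greedily. As a reminder of what that rule means, here is a generic illustration in plain Python (not the library's internal code); the helper name epsilon_greedy_choice is hypothetical.

# Generic epsilon-greedy rule: with probability epsilon take a random legal action,
# otherwise take the current greedy action.
import random

def epsilon_greedy_choice(greedy_action, legal_actionL, epsilon=0.1):
    if random.random() < epsilon:
        return random.choice(legal_actionL)
    return greedy_action

# e.g. the greedy move 'U' is usually taken, but ~10% of the time any legal move is:
print(epsilon_greedy_choice('U', ['U', 'D', 'L', 'R'], epsilon=0.1))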
alpha_mc = 0.02
gamma = 1.0

true_valueD = {'A': 1.0/6.0, 'B': 2.0/6.0, 'C': 3.0/6.0, 'D': 4.0/6.0, 'E': 5.0/6.0}

for o_loop in range(1, 101):
    print('%2i' % o_loop, end=' ')
    if o_loop % 20 == 0:
        print()

    # make 2 state value objects.
    sv_td = StateValueColl( rw_mrp, init_val=0.5 )
    sv_mc = StateValueColl( rw_mrp, init_val=0.5 )

    for i_loop in range(NumEpisodes):
        episode = make_episode('C', policy, rw_mrp, rw_mrp.terminal_set)

        for dr in episode.get_rev_discounted_returns( gamma=gamma ):
            (s_hash, a_desc, reward, sn_hash, G) = dr
            sv_mc.mc_update( s_hash, alpha_mc, G )
            sv_td.td0_update( s_hash=s_hash, alpha=alpha_td, gamma=gamma,
                              sn_hash=sn_hash, reward=reward )

        # add this loop's state values to the running averages
        mc_rms_raveL[i_loop].add_val( sv_mc.calc_rms_error( true_valueD ) )
        td_rms_raveL[i_loop].add_val( sv_td.calc_rms_error( true_valueD ) )

mc_rmsL = [R.get_ave() for R in mc_rms_raveL]
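The excerpt above uses NumEpisodes, alpha_td, mc_rms_raveL, and td_rms_raveL without showing their setup. A minimal sketch of one way they could be defined before the outer loop follows; RunningAve is a hypothetical stand-in for whatever running-average helper the library provides, and NumEpisodes/alpha_td values are assumed here only to make the sketch self-contained.

# Assumed setup for the per-episode running averages used in the loop above.
NumEpisodes = 100   # assumed value
alpha_td = 0.1      # assumed value

class RunningAve:
    """Hypothetical running-average helper (stand-in, not a library class)."""
    def __init__(self):
        self.total, self.count = 0.0, 0
    def add_val(self, v):
        self.total += v
        self.count += 1
    def get_ave(self):
        return self.total / max(1, self.count)

# one running average per episode index, averaged over the 100 outer loops
mc_rms_raveL = [RunningAve() for _ in range(NumEpisodes)]
td_rms_raveL = [RunningAve() for _ in range(NumEpisodes)]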
def mc_every_visit_prediction( policy, state_value_coll, all_start_states=False,
                               do_summ_print=True, show_last_change=True, show_banner=True,
                               max_episode_steps=10000,
                               alpha=0.1, const_alpha=True, alpha_half_life=200,
                               max_num_episodes=1000, min_num_episodes=10,
                               max_abserr=0.001, gamma=0.9,
                               result_list='abserr', true_valueD=None,
                               value_snapshot_loopL=None):  # if input, save V(s) snapshot at iteration steps indicated
    """
    ... GIVEN A POLICY TO EVALUATE
    apply Monte Carlo Every Visit Prediction

    Use Episode Discounted Returns to find V(s), State-Value Function

    Terminates when abserr < max_abserr

    Assume that V(s), state_value_coll, has been initialized prior to call.
    (Note that the StateValues object has a reference to the Environment object)

    Assume environment attached to policy will have method "get_any_action_state_hash"
    in order to begin at any start state.

    state_value_coll WILL BE CHANGED... policy WILL NOT.
    """
    resultL = []      # based on result_list, can be "rms" or "abserr"
    value_snapD = {}  # index=loop counter, value=dict of {s_hash:Vs, ...}

    # ==> Note: the reference to Environment object as "state_value_coll.environment"
    Env = state_value_coll.environment
    episode = Episode(Env.name + ' Episode')

    alpha_obj = Alpha(alpha=alpha, const_alpha=const_alpha, half_life=alpha_half_life)

    if do_summ_print:
        print('=============== EVALUATING THE FOLLOWING POLICY ====================')
        policy.summ_print(verbosity=0, environment=Env, show_env_states=False, none_str='*')

    if all_start_states:
        s = 'Starting a Maximum of %i Monte Carlo All-Start-State Iterations\nGamma = %g' % (
            max_num_episodes, gamma)
        start_stateL = [s_hash for s_hash in Env.iter_all_action_states()]
    else:
        s = 'Starting a Maximum of %i Monte Carlo Iterations from state "%s"\nGamma = %g' % (
            max_num_episodes, str(Env.start_state_hash), gamma)
        start_stateL = [Env.start_state_hash]

    if show_banner:
        banner(s, banner_char='', leftMargin=0, just='center')

    num_episodes = 0
    keep_looping = True  # value-iteration stopping criteria
    progress_str = ''

    while (num_episodes <= max_num_episodes - 1) and keep_looping:
        keep_looping = False
        abserr = 0.0  # calculated below as part of termination criteria

        # policy evaluation
        random.shuffle(start_stateL)
        for start_hash in start_stateL:
            # break from inner loop if max_num_episodes is hit.
            if num_episodes >= max_num_episodes:
                break

            make_episode(start_hash, policy, Env, Env.terminal_set, episode=episode,
                         max_steps=max_episode_steps, eps_greedy=None)
            num_episodes += 1

            for dr in episode.get_rev_discounted_returns(gamma=gamma):
                (s_hash, a_desc, reward, sn_hash, G) = dr
                state_value_coll.mc_update(s_hash, alpha_obj(), G)

            abserr = state_value_coll.get_biggest_action_state_err()
            if abserr > max_abserr:
                keep_looping = True

            if num_episodes < min_num_episodes:
                keep_looping = True  # must loop for min_num_episodes at least

            pc_done = 100.0 * float(num_episodes) / float(max_num_episodes)
            out_str = '%i%%' % (5 * (int(pc_done / 5.0)))
            if out_str != progress_str:
                print(out_str, end=' ')
                progress_str = out_str

            if result_list == 'rms':
                resultL.append(state_value_coll.calc_rms_error(true_valueD))
            elif result_list == 'abserr':
                resultL.append(abserr)
            else:
                pass  # don't save anything to resultL

            if value_snapshot_loopL is not None and num_episodes in value_snapshot_loopL:
                value_snapD[num_episodes] = state_value_coll.get_snapshot()

    if do_summ_print:
        s = ''
        if num_episodes >= max_num_episodes:
            s = ' (NOTE: STOPPED ON MAX-ITERATIONS)'
        print('Exited MC Every-Visit Policy Evaluation', s)
        print('   num episodes  =', num_episodes,
              ' (min limit=%i)' % min_num_episodes,
              ' (max limit=%i)' % max_num_episodes)
        print('   gamma         =', gamma)
        print('   estimated err =', abserr)
        print('   Error limit   =', max_abserr)

        state_value_coll.summ_print(show_last_change=show_last_change, show_states=True)

    return resultL, value_snapD
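A minimal usage sketch of mc_every_visit_prediction as defined above, reusing the random-walk MRP, equiprobable policy, and true_valueD from the earlier listing; the parameter values shown are illustrative.

# Usage sketch (assumes rw_mrp, policy, and true_valueD from the random-walk listing).
sv = StateValueColl(rw_mrp, init_val=0.5)

resultL, value_snapD = mc_every_visit_prediction(
    policy, sv,
    all_start_states=False,
    alpha=0.1, const_alpha=True,
    max_num_episodes=100, min_num_episodes=10, max_abserr=0.001,
    gamma=1.0,
    result_list='rms', true_valueD=true_valueD,
    value_snapshot_loopL=(1, 10, 100))

# resultL holds the per-episode RMS error vs. true_valueD;
# value_snapD holds V(s) snapshots at episodes 1, 10, and 100.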
def mc_first_visit_prediction( policy, state_value_ave, first_visit=True,
                               do_summ_print=True, showRunningAve=False,
                               max_episode_steps=10000,
                               max_num_episodes=1000, min_num_episodes=10,
                               max_abserr=0.001, gamma=0.9):
    """
    ... GIVEN A POLICY TO EVALUATE
    apply Monte Carlo First Visit Prediction

    Use Episode Discounted Returns to find V(s), State-Value Function

    Terminates when abserr < max_abserr

    Assume that V(s), state_value_ave, has been initialized prior to call.
    (Note that the StateValues object has a reference to the Environment object)

    Assume environment attached to policy will have method "get_any_action_state_hash"
    in order to begin at any start state.

    state_value_ave WILL BE CHANGED... policy WILL NOT.
    """
    # ==> Note: the reference to Environment object as "state_value_ave.environment"
    Env = state_value_ave.environment
    episode = Episode( Env.name + ' Episode' )

    if do_summ_print:
        print('=============== EVALUATING THE FOLLOWING POLICY ====================')
        policy.summ_print( verbosity=0, environment=Env, show_env_states=False, none_str='*')

        s = 'Starting a Maximum of %i Monte Carlo All-Start-State Iterations\nGamma = %g' % (
            max_num_episodes, gamma)
        banner(s, banner_char='', leftMargin=0, just='center')

    keep_looping = True  # value-iteration stopping criteria
    progress_str = ''
    num_episodes = 0

    while (num_episodes <= max_num_episodes - 1) and keep_looping:
        keep_looping = False
        abserr = 0.0  # calculated below as part of termination criteria

        # policy evaluation
        for start_hash in Env.iter_all_action_states( randomize=True ):
            # break from inner loop if max_num_episodes is hit.
            if num_episodes >= max_num_episodes:
                break

            make_episode(start_hash, policy, Env, Env.terminal_set, episode=episode,
                         max_steps=max_episode_steps, eps_greedy=None)
            num_episodes += 1

            for dr in episode.get_rev_discounted_returns( gamma=gamma,
                                                          first_visit=first_visit,
                                                          visit_type='S'):
                (s_hash, a_desc, reward, sn_hash, G) = dr
                state_value_ave.add_val( s_hash, G )

            abserr = state_value_ave.get_biggest_action_state_err()
            if abserr > max_abserr:
                keep_looping = True

            if num_episodes < min_num_episodes:
                keep_looping = True  # must loop for min_num_episodes at least

            pc_done = 100.0 * float(num_episodes) / float(max_num_episodes)
            out_str = '%i%%' % ( 5 * (int(pc_done / 5.0)) )
            if out_str != progress_str:
                print(out_str, end=' ')
                progress_str = out_str

    if do_summ_print:
        s = ''
        if num_episodes >= max_num_episodes:
            s = ' (NOTE: STOPPED ON MAX-ITERATIONS)'
        print( 'Exited MC First-Visit Policy Evaluation', s )
        print( '   num episodes  =', num_episodes,
               ' (min limit=%i)' % min_num_episodes,
               ' (max limit=%i)' % max_num_episodes )
        print( '   gamma         =', gamma )
        print( '   estimated err =', abserr )
        print( '   Error limit   =', max_abserr )

        state_value_ave.summ_print( showRunningAve=showRunningAve, show_states=True )

    return abserr
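The first_visit=True option means that, within a single episode, only the first occurrence of each state contributes its return to the average. A generic illustration of that filter follows (not the library's internal code); the helper name first_visit_returns and the sample data are hypothetical.

# Generic first-visit filter: stepping through an episode forward in time,
# keep only the FIRST occurrence of each state and its return G.
def first_visit_returns(state_G_pairs):
    """state_G_pairs: [(s0, G0), (s1, G1), ...] in time order for one episode."""
    seen = set()
    outL = []
    for s, G in state_G_pairs:
        if s not in seen:
            seen.add(s)
            outL.append((s, G))
    return outL

# e.g. state 'C' is visited twice; only its first return (0.25) is kept:
print(first_visit_returns([('C', 0.25), ('D', 0.5), ('C', 0.5), ('D', 1.0)]))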
def mc_exploring_starts(environment, initial_policy='default',
                        read_pickle_file='', save_pickle_file='',
                        first_visit=True,
                        do_summ_print=True, showRunningAve=False, fmt_Q='%g', fmt_R='%g',
                        show_initial_policy=True,
                        max_num_episodes=1000, min_num_episodes=10, max_abserr=0.001,
                        gamma=0.9,
                        max_episode_steps=10000,
                        iteration_prints=0):
    """
    ... GIVEN AN ENVIRONMENT ...
    apply Monte Carlo Exploring Starts to find the OPTIMAL POLICY

    initial_policy can be 'default', 'random', policy_dictionary, Policy object

    Returns: Policy and ActionValueRunAveColl objects

    Use Episode Discounted Returns to find Q(s,a), Action-Value Function

    Terminates when abserr < max_abserr

    Assume that Q(s,a), action_value_ave, has been initialized prior to call.

    Assume environment attached to policy will have method "get_any_action_state_hash"
    in order to begin at any action state.

    CREATES BOTH policy AND action_value OBJECTS.
    """
    # create Policy and ActionValueRunAveColl objects
    policy = Policy(environment=environment)

    if initial_policy == 'default':
        print('Initializing Policy to "default" in mc_exploring_starts')
        policy.learn_a_legal_action_from_env(env=environment)
        policy.set_policy_from_piD(environment.get_default_policy_desc_dict())
    elif initial_policy == 'random':
        print('Initializing Policy to "random" in mc_exploring_starts')
        policy.intialize_policy_to_random(env=environment)
    elif isinstance(initial_policy, Policy):
        policy = initial_policy
    else:
        print('Initializing Policy to "custom policy" in mc_exploring_starts')
        policy.learn_a_legal_action_from_env(env=environment)
        policy.set_policy_from_piD(initial_policy)

    action_value_ave = ActionValueRunAveColl(environment)
    action_value_ave.init_Qsa_to_zero()  # Terminal states w/o an action are NOT included
    #action_value_ave.summ_print()

    if read_pickle_file:
        policy.init_from_pickle_file(read_pickle_file)
        action_value_ave.init_from_pickle_file(read_pickle_file)

    if do_summ_print:
        if show_initial_policy:
            print('=============== STARTING WITH THE INITIAL POLICY ====================')
            policy.summ_print(verbosity=0, environment=environment,
                              show_env_states=False, none_str='*')

        s = 'Starting a Maximum of %i Monte Carlo Exploring Start Episodes\nfor "%s" with Gamma = %g' %\
            (max_num_episodes, environment.name, gamma)
        banner(s, banner_char='', leftMargin=0, just='center')

    # create an Episode object for getting returns
    episode = Episode(environment.name + ' Episode')

    # set counter and flag
    num_episodes = 0
    keep_looping = True
    progress_str = ''

    while (num_episodes <= max_num_episodes - 1) and keep_looping:
        keep_looping = False
        abserr = 0.0  # calculated below as part of termination criteria

        for start_hash in environment.iter_all_action_states(randomize=True):
            a_descL = environment.get_state_legal_action_list(start_hash)
            # randomize action order
            random.shuffle(a_descL)

            # try every initial action for each start_hash
            for a_desc in a_descL:
                # break from inner loop if max_num_episodes is hit.
                if num_episodes >= max_num_episodes:
                    break

                make_episode(start_hash, policy, environment, environment.terminal_set,
                             episode=episode, first_a_desc=a_desc,
                             max_steps=max_episode_steps, eps_greedy=None)
                num_episodes += 1

                for dr in episode.get_rev_discounted_returns(gamma=gamma,
                                                             first_visit=first_visit,
                                                             visit_type='SA'):
                    # look at each step from episode and calc average Q(s,a)
                    (s, a, r, sn, G) = dr
                    action_value_ave.add_val(s, a, G)

                    aL = environment.get_state_legal_action_list(s)
                    if aL:
                        best_a_desc, best_a_val = aL[0], float('-inf')
                        bestL = [best_a_desc]
                        for a in aL:
                            q = action_value_ave.get_ave(s, a)
                            if q > best_a_val:
                                best_a_desc, best_a_val = a, q
                                bestL = [a]
                            elif q == best_a_val:
                                bestL.append(a)
                        best_a_desc = random.choice(bestL)
                        policy.set_sole_action(s, best_a_desc)

                abserr = action_value_ave.get_biggest_action_state_err()
                if abserr > max_abserr:
                    keep_looping = True

                if num_episodes < min_num_episodes:
                    keep_looping = True  # must loop for min_num_episodes at least

                pc_done = 100.0 * float(num_episodes) / float(max_num_episodes)
                out_str = '%3i%%' % (5 * (int(pc_done / 5.0)))
                if out_str != progress_str:
                    score = environment.get_policy_score(policy=policy,
                                                         start_state_hash=None,
                                                         step_limit=1000)
                    print(out_str, ' score=%s' % str(score), ' = (r_sum, n_steps, msg)',
                          ' estimated err =', abserr)
                    progress_str = out_str

    if do_summ_print:
        s = ''
        if num_episodes >= max_num_episodes:
            s = ' (NOTE: STOPPED ON MAX-ITERATIONS)'
        print('Exited MC First-Visit Value Iteration', s)
        print('   num episodes  =', num_episodes,
              ' (min limit=%i)' % min_num_episodes,
              ' (max limit=%i)' % max_num_episodes)
        print('   gamma         =', gamma)
        print('   estimated err =', abserr)
        print('   Error limit   =', max_abserr)

        action_value_ave.summ_print(showRunningAve=showRunningAve, fmt_Q=fmt_Q)
        policy.summ_print(environment=environment, verbosity=0, show_env_states=False)

        try:  # sims may not have a layout_print
            environment.layout_print(vname='reward', fmt=fmt_R,
                                     show_env_states=False, none_str='*')
        except:
            pass

    if save_pickle_file:
        policy.save_to_pickle_file(save_pickle_file)
        action_value_ave.save_to_pickle_file(save_pickle_file)

    return policy, action_value_ave
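A minimal usage sketch of mc_exploring_starts as defined above, assuming the same get_gridworld() environment used in the other listings; the parameter values shown are illustrative.

# Usage sketch (assumes get_gridworld() as in the other listings).
gridworld = get_gridworld()

policy, action_value_ave = mc_exploring_starts(
    gridworld,
    initial_policy='default',
    first_visit=True,
    max_num_episodes=1000, min_num_episodes=10, max_abserr=0.001,
    gamma=0.9)

# policy now holds the greedy policy found; action_value_ave holds the averaged Q(s,a).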
gridworld = get_gridworld()

pi = Policy(environment=gridworld)
pi.set_policy_from_piD(gridworld.get_default_policy_desc_dict())
#pi.summ_print()

eg = EpsilonGreedy(epsilon=0.2, const_epsilon=True, half_life=200, N_episodes_wo_decay=0)

episode = make_episode((2, 0), pi, gridworld, gridworld.terminal_set, eps_greedy=eg)
episode.summ_print()

epi_summ_print(episode, pi, gridworld, show_rewards=True, show_env_states=True, none_str='*')

epi_summ_print(episode, pi, gridworld,