def get_td0_data():
    if 'TD0_raveL' in dataD:
        TD0_raveL = dataD['TD0_raveL']
        Nruns = TD0_raveL[0].num_val
        print(Nruns, ' of TD0_raveL found')
    else:
        TD0_raveL = []
        Nruns = 0

    for loop in range(Nruns, RUN_COUNT):
        learn_tracker.clear()
        policy, state_value = \
            td0_epsilon_greedy(CW, learn_tracker=learn_tracker,
                               initial_Vs=0.0,  # init non-terminal_set of V(s) (terminal_set=0.0)
                               use_list_of_start_states=False,  # use list OR single start state of environment.
                               do_summ_print=False, show_last_change=False,
                               fmt_V='%g', fmt_R='%g',
                               pcent_progress_print=0,
                               show_banner=False,
                               max_num_episodes=500, min_num_episodes=10,
                               max_abserr=0.001, gamma=1.0,
                               max_episode_steps=1000,
                               epsilon=EPSILON,
                               alpha=ALPHA)

        # fold this run's per-episode reward sums into the running averages
        reward_sum_per_episodeL = learn_tracker.reward_sum_per_episode()
        while len(reward_sum_per_episodeL) > len(TD0_raveL):
            TD0_raveL.append(RunningAve())
        for R, r in zip(TD0_raveL, reward_sum_per_episodeL):
            R.add_val(r)

    dataD['TD0_raveL'] = TD0_raveL
    save_to_pickle()
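# get_td0_data() folds per-episode reward sums into RunningAve accumulators
# so that reward curves can be averaged across runs (the alpha-sweep variant
# below does the same). A minimal sketch of such an accumulator, inferred
# from its usage here (add_val, num_val, get_ave); the real IntroRL helper
# class may differ in detail.
class RunningAve:
    def __init__(self):
        self.num_val = 0   # how many values have been added
        self.total = 0.0   # running sum of those values

    def add_val(self, val):
        self.num_val += 1
        self.total += val

    def get_ave(self):
        # average of all values seen so far (0.0 before any are added)
        return self.total / self.num_val if self.num_val else 0.0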
def get_td0_data():
    if 'TD0_raveD' in dataD:
        TD0_raveD = dataD['TD0_raveD']
        ave_run_time = dataD['TD0_ave_run_time']
    else:
        TD0_raveD = {}
        ave_run_time = RunningAve()
        for alpha in ALPHA_LIST:
            TD0_raveD[alpha] = [RunningAve(), RunningAve()]

    Nruns = TD0_raveD[0.1][0].num_val
    print(Nruns, ' of TD0_raveD found')

    for loop in range(Nruns, RUN_COUNT):
        for alpha in ALPHA_LIST:
            start_time = time.time()
            learn_tracker.clear()
            policy, state_value = \
                td0_epsilon_greedy(CW, learn_tracker=learn_tracker,
                                   initial_Vs=0.0,  # init non-terminal_set of V(s) (terminal_set=0.0)
                                   use_list_of_start_states=False,  # use list OR single start state of environment.
                                   do_summ_print=False, show_last_change=False,
                                   fmt_V='%g', fmt_R='%g',
                                   pcent_progress_print=0,
                                   show_banner=False,
                                   max_num_episodes=1000, min_num_episodes=1000,
                                   max_abserr=0.000001, gamma=1.0,
                                   max_episode_steps=10000,
                                   epsilon=EPSILON,
                                   alpha=alpha)

            reward_sum_per_episodeL = learn_tracker.reward_sum_per_episode()
            ave_run_time.add_val(time.time() - start_time)  # compute average run time

            # index 0 averages the first 100 episodes, index 1 all 1000 episodes
            TD0_raveD[alpha][0].add_val(sum(reward_sum_per_episodeL[:100]) / 100.0)
            TD0_raveD[alpha][1].add_val(sum(reward_sum_per_episodeL) / 1000.0)
            print('.', end='')

    print('TD0_ave_run_time = ', ave_run_time.get_ave())
    dataD['TD0_raveD'] = TD0_raveD
    dataD['TD0_ave_run_time'] = ave_run_time
    save_to_pickle('TD0_raveD', 'TD0_ave_run_time')
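# Both versions of get_td0_data() assume a module-level dataD dictionary
# backed by a pickle file, so results accumulate across invocations of the
# script. A minimal sketch of that persistence pattern; the file name and
# the exact save_to_pickle signature are assumptions, not from the
# original scripts.
import os
import pickle

PICKLE_FNAME = 'td0_results.pickle'  # assumed file name

if os.path.exists(PICKLE_FNAME):
    with open(PICKLE_FNAME, 'rb') as f:
        dataD = pickle.load(f)
else:
    dataD = {}

def save_to_pickle(*key_names):
    # key_names, when given, merely document which dataD entries changed;
    # the whole dictionary is rewritten either way.
    with open(PICKLE_FNAME, 'wb') as f:
        pickle.dump(dataD, f)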
# excerpt from the run loop of the comparison script: the Expected SARSA
# results just collected are folded in, then TD(0) is run on the same
# environment; the listing is truncated before the next learner starts.
reward_sum_per_episodeL_es = learn_tracker.reward_sum_per_episode()
while len(reward_sum_per_episodeL_es) > len(ExpSarsa_raveL):
    ExpSarsa_raveL.append(RunningAve())
for R, r in zip(ExpSarsa_raveL, reward_sum_per_episodeL_es):
    R.add_val(r)

learn_tracker.clear()
policy_t, state_value_t = \
    td0_epsilon_greedy(CW, learn_tracker=learn_tracker,
                       initial_Vs=0.0,  # init non-terminal_set of V(s) (terminal_set=0.0)
                       use_list_of_start_states=False,  # use list OR single start state of environment.
                       do_summ_print=False, show_last_change=False,
                       fmt_V='%g', fmt_R='%g',
                       show_banner=False,
                       pcent_progress_print=0,
                       max_num_episodes=500, min_num_episodes=10,
                       max_abserr=0.001, gamma=1.0,
                       max_episode_steps=1000,
                       epsilon=EPSILON,
                       alpha=ALPHA)

reward_sum_per_episodeL_t = learn_tracker.reward_sum_per_episode()
while len(reward_sum_per_episodeL_t) > len(TD0_raveL):
    TD0_raveL.append(RunningAve())
for R, r in zip(TD0_raveL, reward_sum_per_episodeL_t):
    R.add_val(r)

learn_tracker.clear()
policy_s, state_value_s = \
from introrl.agent_supt.episode_maker import make_episode
from introrl.agent_supt.episode_summ_print import epi_summ_print
from introrl.mdp_data.windy_gridworld import get_gridworld
from introrl.agent_supt.learning_tracker import LearnTracker
# td0_epsilon_greedy is used below but was not imported in the original
# listing; this module path is assumed from the IntroRL package layout.
from introrl.td_funcs.td0_epsilon_greedy import td0_epsilon_greedy

gridworld = get_gridworld(step_reward=-1)
learn_tracker = LearnTracker()

policy, state_value = \
    td0_epsilon_greedy(gridworld, learn_tracker=learn_tracker,
                       initial_Vs=0.0,  # init non-terminal_set of V(s) (terminal_set=0.0)
                       read_pickle_file='',
                       save_pickle_file='',
                       use_list_of_start_states=False,  # use list OR single start state of environment.
                       do_summ_print=True, show_last_change=True,
                       fmt_V='%g', fmt_R='%g',
                       max_num_episodes=170, min_num_episodes=10,
                       max_abserr=0.001, gamma=1.0,
                       iteration_prints=0,
                       max_episode_steps=1000,
                       epsilon=0.1, const_epsilon=True, epsilon_half_life=200,
                       alpha=0.5, const_alpha=True, alpha_half_life=200,
                       N_episodes_wo_decay=0)

print('_' * 55)
score = gridworld.get_policy_score(policy, start_state_hash=None, step_limit=1000)
print('Policy Score =', score, ' = (r_sum, n_steps, msg)')

steps_per_episodeL = learn_tracker.steps_per_episode()
print(gridworld.get_info())
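# The steps_per_episodeL list collected above is what the classic windy
# gridworld learning curve plots: episodes completed versus cumulative time
# steps. A hedged sketch of that plot, assuming matplotlib and numpy are
# available; this plotting code is illustrative, not part of the original
# listing.
import numpy as np
import matplotlib.pyplot as plt

cum_steps = np.cumsum(steps_per_episodeL)                    # time steps on the x axis
episode_numbers = np.arange(1, len(steps_per_episodeL) + 1)  # episode count on the y axis

plt.plot(cum_steps, episode_numbers)
plt.xlabel('Time Steps')
plt.ylabel('Episodes Completed')
plt.title('TD(0) Epsilon-Greedy on Windy Gridworld')
plt.show()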