Code Example #1
def get_td0_data():
    if 'TD0_raveL' in dataD:
        TD0_raveL = dataD['TD0_raveL']
        Nruns = TD0_raveL[0].num_val
        print(Nruns, ' of TD0_raveL found')
    else:
        TD0_raveL = []
        Nruns = 0

    for loop in range(Nruns, RUN_COUNT):  # resume from any runs already saved in dataD

        learn_tracker.clear()
        policy, state_value = \
            td0_epsilon_greedy( CW,   learn_tracker=learn_tracker,
                                  initial_Vs=0.0, # init non-terminal_set of V(s) (terminal_set=0.0)
                                  use_list_of_start_states=False, # use list OR single start state of environment.
                                  do_summ_print=False, show_last_change=False, fmt_V='%g', fmt_R='%g',
                                  pcent_progress_print=0,
                                  show_banner = False,
                                  max_num_episodes=500, min_num_episodes=10, max_abserr=0.001,
                                  gamma=1.0,
                                  max_episode_steps=1000,
                                  epsilon=EPSILON,
                                  alpha=ALPHA)
        reward_sum_per_episodeL = learn_tracker.reward_sum_per_episode()

        while len(reward_sum_per_episodeL) > len(TD0_raveL):
            TD0_raveL.append(RunningAve())
        for R, r in zip(TD0_raveL, reward_sum_per_episodeL):
            R.add_val(r)
    dataD['TD0_raveL'] = TD0_raveL
    save_to_pickle()
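
Code Example #1 (and the snippets that follow) relies on names defined elsewhere in the author's script: the RunningAve running-average class, the persistent results dictionary dataD, save_to_pickle(), the constants RUN_COUNT, EPSILON and ALPHA, the cliff-walking environment CW, the shared learn_tracker, and the td0_epsilon_greedy function from introrl (its import appears in Code Example #4). The sketch below shows one way the persistence helpers could be written, inferred only from how they are used above; the file name and implementation details are assumptions, not the original code.

import os
import pickle

PICKLE_FNAME = 'td0_data.pickle'   # hypothetical file name, not from the original script

class RunningAve:
    """Incremental average; only the members used in the examples are sketched."""
    def __init__(self):
        self.num_val = 0       # number of values added so far
        self.sum_val = 0.0
    def add_val(self, v):
        self.num_val += 1
        self.sum_val += v
    def get_ave(self):
        return self.sum_val / self.num_val if self.num_val else 0.0

# dataD persists between script invocations so results keep accumulating.
if os.path.exists(PICKLE_FNAME):
    with open(PICKLE_FNAME, 'rb') as f:
        dataD = pickle.load(f)
else:
    dataD = {}

def save_to_pickle(*_keys):
    """Write the whole dataD dict back to disk (any key-name arguments are ignored here)."""
    with open(PICKLE_FNAME, 'wb') as f:
        pickle.dump(dataD, f)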
Code Example #2
def get_td0_data():
    if 'TD0_raveD' in dataD:
        TD0_raveD = dataD['TD0_raveD']
        ave_run_time = dataD['TD0_ave_run_time']
    else:
        TD0_raveD = {}
        ave_run_time = RunningAve()
        for alpha in ALPHA_LIST:
            TD0_raveD[alpha] = [RunningAve(), RunningAve()]

    Nruns = TD0_raveD[ALPHA_LIST[0]][0].num_val  # runs completed so far (same count for every alpha)
    print(Nruns, ' of TD0_raveD found')

    for loop in range(Nruns, RUN_COUNT):
        for alpha in ALPHA_LIST:

            start_time = time.time()
            learn_tracker.clear()
            policy, state_value = \
                td0_epsilon_greedy( CW,  learn_tracker=learn_tracker,
                                      initial_Vs=0.0, # init non-terminal_set of V(s) (terminal_set=0.0)
                                      use_list_of_start_states=False, # use list OR single start state of environment.
                                      do_summ_print=False, show_last_change=False, fmt_V='%g', fmt_R='%g',
                                      pcent_progress_print=0,
                                      show_banner = False,
                                      max_num_episodes=1000, min_num_episodes=1000, max_abserr=0.000001,
                                      gamma=1.0,
                                      max_episode_steps=10000,
                                      epsilon=EPSILON,
                                      alpha=alpha)
            reward_sum_per_episodeL = learn_tracker.reward_sum_per_episode()

            ave_run_time.add_val(time.time() -
                                 start_time)  # compute average run time
            TD0_raveD[alpha][0].add_val(
                sum(reward_sum_per_episodeL[:100]) / 100.0)  # ave reward over first 100 episodes
            TD0_raveD[alpha][1].add_val(
                sum(reward_sum_per_episodeL) / 1000.0)  # ave reward over all 1000 episodes

        print('.', end='')
    print('TD0_ave_run_time = ', ave_run_time.get_ave())

    dataD['TD0_raveD'] = TD0_raveD
    dataD['TD0_ave_run_time'] = ave_run_time
    save_to_pickle('TD0_raveD', 'TD0_ave_run_time')
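
The snippet above also assumes "import time" at module level for the run-time measurement. Once the runs have accumulated, the per-alpha averages stored in dataD can be read back out; the report function below is an illustration added here, not part of the original source, and it only touches the dataD entries created by get_td0_data().

def report_td0_results():
    """Print the accumulated per-alpha averages (illustration only)."""
    TD0_raveD = dataD['TD0_raveD']
    print('alpha   ave R (first 100 epi)   ave R (all 1000 epi)')
    for alpha in sorted(TD0_raveD.keys()):
        first100_ave = TD0_raveD[alpha][0].get_ave()
        full_ave = TD0_raveD[alpha][1].get_ave()
        print('%5g   %21.2f   %20.2f' % (alpha, first100_ave, full_ave))
    print('average run time per (alpha, run) =', dataD['TD0_ave_run_time'].get_ave(), 'sec')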
Code Example #3
    reward_sum_per_episodeL_es = learn_tracker.reward_sum_per_episode()  # Expected SARSA reward sums

    while len(reward_sum_per_episodeL_es) > len(ExpSarsa_raveL):
        ExpSarsa_raveL.append(RunningAve())
    for R, r in zip(ExpSarsa_raveL, reward_sum_per_episodeL_es):
        R.add_val(r)

    learn_tracker.clear()
    policy_t, state_value_t = \
        td0_epsilon_greedy( CW,  learn_tracker=learn_tracker,
                              initial_Vs=0.0, # init non-terminal_set of V(s) (terminal_set=0.0)
                              use_list_of_start_states=False, # use list OR single start state of environment.
                              do_summ_print=False, show_last_change=False, fmt_V='%g', fmt_R='%g',
                              show_banner = False,
                              pcent_progress_print=0,
                              max_num_episodes=500, min_num_episodes=10, max_abserr=0.001,
                              gamma=1.0,
                              max_episode_steps=1000,
                              epsilon=EPSILON,
                              alpha=ALPHA)

    reward_sum_per_episodeL_t = learn_tracker.reward_sum_per_episode()  # TD(0) reward sums

    while len(reward_sum_per_episodeL_t) > len(TD0_raveL):
        TD0_raveL.append(RunningAve())
    for R, r in zip(TD0_raveL, reward_sum_per_episodeL_t):
        R.add_val(r)

    learn_tracker.clear()
    policy_s, state_value_s = \
Code Example #4
# NOTE: the original listing does not show where td0_epsilon_greedy is imported from;
# the path below is the assumed location within the introrl package.
from introrl.td_funcs.td0_epsilon_greedy import td0_epsilon_greedy

from introrl.agent_supt.episode_maker import make_episode
from introrl.agent_supt.episode_summ_print import epi_summ_print

from introrl.mdp_data.windy_gridworld import get_gridworld
from introrl.agent_supt.learning_tracker import LearnTracker

gridworld = get_gridworld(step_reward=-1)
learn_tracker = LearnTracker()

policy, state_value = \
    td0_epsilon_greedy( gridworld, learn_tracker=learn_tracker,
                        initial_Vs=0.0, # init non-terminal_set of V(s) (terminal_set=0.0)
                        read_pickle_file='',
                        save_pickle_file='',
                        use_list_of_start_states=False, # use list OR single start state of environment.
                        do_summ_print=True, show_last_change=True, fmt_V='%g', fmt_R='%g',
                        max_num_episodes=170, min_num_episodes=10, max_abserr=0.001, gamma=1.0,
                        iteration_prints=0,
                        max_episode_steps=1000,
                        epsilon=0.1, const_epsilon=True, epsilon_half_life=200,
                        alpha=0.5, const_alpha=True, alpha_half_life=200,
                        N_episodes_wo_decay=0)

print('_' * 55)
score = gridworld.get_policy_score(policy,
                                   start_state_hash=None,
                                   step_limit=1000)
print('Policy Score =', score, ' = (r_sum, n_steps, msg)')

steps_per_episodeL = learn_tracker.steps_per_episode()
print(gridworld.get_info())
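
The steps_per_episodeL list collected by learn_tracker can be turned into the familiar "episodes completed versus cumulative time steps" learning curve for the windy gridworld. The short plotting sketch below is an added illustration that assumes matplotlib is available; it is not part of the original script.

import matplotlib.pyplot as plt

# Accumulate total time steps so the x-axis is cumulative steps and the y-axis is episodes completed.
cum_stepsL = []
total_steps = 0
for n_steps in steps_per_episodeL:
    total_steps += n_steps
    cum_stepsL.append(total_steps)

plt.plot(cum_stepsL, range(1, len(cum_stepsL) + 1))
plt.xlabel('Time steps (cumulative)')
plt.ylabel('Episodes completed')
plt.title('TD(0) epsilon-greedy on Windy Gridworld')
plt.show()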