Example #1
def get_qlearning_data():
    if 'Qlearn_raveL' in dataD:
        Qlearn_raveL = dataD['Qlearn_raveL']
        Nruns = Qlearn_raveL[0].num_val
        print(Nruns, ' of Qlearn_raveL found')
    else:
        Qlearn_raveL = []
        Nruns = 0

    for loop in range(Nruns, RUN_COUNT):

        learn_tracker.clear()
        policy, state_value = \
            qlearning_epsilon_greedy( CW,   learn_tracker=learn_tracker,
                                  initial_Qsa=0.0, # init non-terminal_set of V(s) (terminal_set=0.0)
                                  use_list_of_start_states=False, # use list OR single start state of environment.
                                  do_summ_print=False, show_last_change=False, fmt_Q='%g', fmt_R='%g',
                                  pcent_progress_print=0,
                                  show_banner = False,
                                  max_num_episodes=500, min_num_episodes=10, max_abserr=0.001,
                                  gamma=1.0,
                                  max_episode_steps=1000,
                                  epsilon=EPSILON,
                                  alpha=ALPHA)
        reward_sum_per_episodeL = learn_tracker.reward_sum_per_episode()

        while len(reward_sum_per_episodeL) > len(Qlearn_raveL):
            Qlearn_raveL.append(RunningAve())
        for R, r in zip(Qlearn_raveL, reward_sum_per_episodeL):
            R.add_val(r)
    dataD['Qlearn_raveL'] = Qlearn_raveL
    save_to_pickle()
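
These snippets lean on a RunningAve accumulator (add_val, get_ave and a num_val counter) plus the dataD / save_to_pickle persistence helpers defined elsewhere in the same script. A minimal stand-in for the assumed RunningAve interface, if you want to try the snippet outside that script:

class RunningAve:
    """Minimal stand-in for the RunningAve helper assumed by these examples."""

    def __init__(self):
        self.num_val = 0    # number of values added so far
        self.total = 0.0    # running sum of the added values

    def add_val(self, val):
        self.num_val += 1
        self.total += val

    def get_ave(self):
        # average of the values added so far (0.0 if none yet)
        return self.total / self.num_val if self.num_val else 0.0
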
Example #2
def get_qlearning_data():
    if 'Qlearn_raveD' in dataD:
        Qlearn_raveD = dataD['Qlearn_raveD']
        ave_run_time = dataD['Qlearn_ave_run_time']
    else:
        Qlearn_raveD = {}
        ave_run_time = RunningAve()
        for alpha in ALPHA_LIST:
            Qlearn_raveD[alpha] = [RunningAve(), RunningAve()]

    Nruns = Qlearn_raveD[ALPHA_LIST[0]][0].num_val  # any alpha key works for counting completed runs
    print(Nruns, ' of Qlearn_raveD found')

    for loop in range(Nruns, RUN_COUNT):
        for alpha in ALPHA_LIST:

            start_time = time.time()
            learn_tracker.clear()
            policy, state_value = \
                qlearning_epsilon_greedy( CW,  learn_tracker=learn_tracker,
                                      initial_Qsa=0.0, # init non-terminal_set of V(s) (terminal_set=0.0)
                                      use_list_of_start_states=False, # use list OR single start state of environment.
                                      do_summ_print=False, show_last_change=False, fmt_Q='%g', fmt_R='%g',
                                      pcent_progress_print=0,
                                      show_banner = False,
                                      max_num_episodes=1000, min_num_episodes=1000, max_abserr=0.000001,
                                      gamma=1.0,
                                      max_episode_steps=10000,
                                      epsilon=EPSILON,
                                      alpha=alpha)
            reward_sum_per_episodeL = learn_tracker.reward_sum_per_episode()

            ave_run_time.add_val(time.time() - start_time)  # accumulate average run time

            # average reward over the first 100 episodes
            Qlearn_raveD[alpha][0].add_val(sum(reward_sum_per_episodeL[:100]) / 100.0)
            # average reward over all 1000 episodes
            Qlearn_raveD[alpha][1].add_val(sum(reward_sum_per_episodeL) / 1000.0)

        print('.', end='')
    print('Qlearn_ave_run_time = ', ave_run_time.get_ave())

    dataD['Qlearn_raveD'] = Qlearn_raveD
    dataD['Qlearn_ave_run_time'] = ave_run_time
    save_to_pickle('Qlearn_raveD', 'Qlearn_ave_run_time')
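
Once Qlearn_raveD has been filled in, the per-alpha averages can be pulled out and compared. The plotting sketch below is my own, not part of the source (matplotlib and the axis labels are assumptions); it uses only names defined above:

import matplotlib.pyplot as plt

# index 0 holds the average reward over the first 100 episodes,
# index 1 the average reward over all 1000 episodes
first100_aveL = [Qlearn_raveD[alpha][0].get_ave() for alpha in ALPHA_LIST]
full_aveL = [Qlearn_raveD[alpha][1].get_ave() for alpha in ALPHA_LIST]

plt.plot(ALPHA_LIST, first100_aveL, 'o-', label='first 100 episodes')
plt.plot(ALPHA_LIST, full_aveL, 's--', label='all 1000 episodes')
plt.xlabel('alpha')
plt.ylabel('average reward per episode')
plt.legend()
plt.show()
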
Example #3
    import os, sys
    from introrl.td_funcs.qlearning_epsilon_greedy import qlearning_epsilon_greedy
    from introrl.td_funcs.sarsa_epsilon_greedy import sarsa_epsilon_greedy
    from introrl.agent_supt.episode_maker import make_episode
    from introrl.agent_supt.episode_summ_print import epi_summ_print

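    # MaximizationBiasMDP is assumed to be defined or imported earlier in the source script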
    MB = MaximizationBiasMDP()
    MB.layout.s_hash_print(none_str='*')

    policy, state_value = \
        qlearning_epsilon_greedy( MB,
                              initial_Qsa=0.0, # init non-terminal_set of V(s) (terminal_set=0.0)
                              use_list_of_start_states=False, # use list OR single start state of environment.
                              do_summ_print=True, show_last_change=True, fmt_Q='%g', fmt_R='%g',
                              pcent_progress_print=0,
                              show_banner = True,
                              max_num_episodes=10, min_num_episodes=10, max_abserr=0.001,
                              gamma=1.0,
                              max_episode_steps=100,
                              epsilon=0.1,
                              alpha=0.1)

    episode = make_episode(MB.start_state_hash,
                           policy,
                           MB,
                           MB.terminal_set,
                           max_steps=20)
    epi_summ_print(episode,
                   policy,
                   MB,
                   show_rewards=False)
Example #4
        reward_sum_per_episodeL_s = learn_tracker.reward_sum_per_episode()

        while len(reward_sum_per_episodeL_s) > len(Sarsa_raveL):
            Sarsa_raveL.append(RunningAve())
        for R, r in zip(Sarsa_raveL, reward_sum_per_episodeL_s):
            R.add_val(r)

        learn_tracker.clear()
        policy_q, state_value_q = \
            qlearning_epsilon_greedy( CW,  learn_tracker=learn_tracker,
                                  initial_Qsa=0.0, # init non-terminal_set of V(s) (terminal_set=0.0)
                                  use_list_of_start_states=False, # use list OR single start state of environment.
                                  do_summ_print=False, show_last_change=False, fmt_Q='%g', fmt_R='%g',
                                  max_num_episodes=500, min_num_episodes=10, max_abserr=0.001, 
                                  gamma=1.0,
                                  max_episode_steps=1000,
                                  epsilon=EPSILON, 
                                  alpha=ALPHA)
                                  
        reward_sum_per_episodeL_q = learn_tracker.reward_sum_per_episode()

        while len(reward_sum_per_episodeL_q) > len(Qlearn_raveL):
            Qlearn_raveL.append(RunningAve())
        for R, r in zip(Qlearn_raveL, reward_sum_per_episodeL_q):
            R.add_val(r)

# make a list of the averages
reward_sum_per_episodeL_q = [R.get_ave() for R in Qlearn_raveL]
reward_sum_per_episodeL_s = [R.get_ave() for R in Sarsa_raveL]
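
With both running-average lists reduced to plain Python lists, a side-by-side plot of SARSA against Q-learning is a natural next step. This is a sketch of my own (matplotlib is an assumption, not part of the source):

import matplotlib.pyplot as plt

plt.plot(reward_sum_per_episodeL_s, label='SARSA')
plt.plot(reward_sum_per_episodeL_q, label='Q-learning')
plt.xlabel('episode')
plt.ylabel('average sum of rewards per episode')
plt.legend()
plt.show()
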
Example #5
from introrl.td_funcs.qlearning_epsilon_greedy import qlearning_epsilon_greedy
from introrl.agent_supt.episode_maker import make_episode
from introrl.agent_supt.episode_summ_print import epi_summ_print

from introrl.mdp_data.windy_gridworld import get_gridworld
from introrl.agent_supt.learning_tracker import LearnTracker

gridworld = get_gridworld(step_reward=-1)
learn_tracker = LearnTracker()

policy, state_value = \
    qlearning_epsilon_greedy( gridworld, learn_tracker=learn_tracker,
                          initial_Qsa=0.0, # init non-terminal_set of V(s) (terminal_set=0.0)
                          read_pickle_file='',
                          save_pickle_file='',
                          use_list_of_start_states=False, # use list OR single start state of environment.
                          do_summ_print=True, show_last_change=True, fmt_Q='%g', fmt_R='%g',
                          max_num_episodes=170, min_num_episodes=10, max_abserr=0.001, gamma=1.0,
                          iteration_prints=0,
                          max_episode_steps=1000,
                          epsilon=0.1, const_epsilon=True, epsilon_half_life=200,
                          alpha=0.5, const_alpha=True, alpha_half_life=200,
                          N_episodes_wo_decay=0)

print('_' * 55)
score = gridworld.get_policy_score(policy,
                                   start_state_hash=None,
                                   step_limit=1000)
print('Policy Score =', score, ' = (r_sum, n_steps, msg)')

steps_per_episodeL = learn_tracker.steps_per_episode()

print(gridworld.get_info())
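
steps_per_episodeL is collected above but not used in the snippet. A common follow-up for the windy gridworld is to plot episodes completed against cumulative time steps; the code below is my own sketch, not part of the source:

import matplotlib.pyplot as plt

# cumulative time steps at the end of each episode
cum_stepsL = []
total_steps = 0
for n_steps in steps_per_episodeL:
    total_steps += n_steps
    cum_stepsL.append(total_steps)

plt.plot(cum_stepsL, list(range(1, len(cum_stepsL) + 1)))
plt.xlabel('time steps')
plt.ylabel('episodes completed')
plt.show()
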