import matplotlib.pyplot as plt

from introrl.td_funcs.sarsa_epsilon_greedy import sarsa_epsilon_greedy
from introrl.agent_supt.episode_maker import make_episode
from introrl.agent_supt.episode_summ_print import epi_summ_print

from introrl.mdp_data.windy_gridworld import get_gridworld
from introrl.agent_supt.learning_tracker import LearnTracker

gridworld = get_gridworld(step_reward=-1)
learn_tracker = LearnTracker()

policy, state_value = \
    sarsa_epsilon_greedy( gridworld, learn_tracker=learn_tracker,
                          initial_Qsa=0.0, # initialize Q(s,a) for non-terminal states (terminal states stay 0.0)
                          read_pickle_file='',
                          save_pickle_file='',
                          use_list_of_start_states=False, # use list OR single start state of environment.
                          do_summ_print=True, show_last_change=True, fmt_Q='%g', fmt_R='%g',
                          max_num_episodes=170, min_num_episodes=10, max_abserr=0.001, gamma=1.0,
                          iteration_prints=0,
                          max_episode_steps=10000,
                          epsilon=0.1, const_epsilon=True,
                          alpha=0.5, const_alpha=True)

print('_' * 55)
score = gridworld.get_policy_score(policy,
                                   start_state_hash=None,
                                   step_limit=1000)
print('Policy Score =', score, ' = (r_sum, n_steps, msg)')
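
# Possible follow-up, not part of the original excerpt: plot the learning curve
# gathered by learn_tracker using the matplotlib import above.  The
# steps_per_episode() accessor is an assumption about the LearnTracker API;
# check the introrl documentation for the actual method name.
steps_per_epiL = learn_tracker.steps_per_episode()   # assumed accessor
plt.plot(range(1, len(steps_per_epiL) + 1), steps_per_epiL)
plt.xlabel('Episode')
plt.ylabel('Steps to Reach Goal')
plt.title('Windy Gridworld, SARSA epsilon-greedy')
plt.show()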
Example #2
    if save_pickle_file:
        policy.save_to_pickle_file( save_pickle_file )
        action_value_coll.save_to_pickle_file( save_pickle_file )
        
    return policy, action_value_coll #, steps_per_episodeL, reward_sum_per_episodeL

if __name__ == "__main__": # pragma: no cover
    
    from introrl.agent_supt.episode_maker import make_episode
    from introrl.agent_supt.episode_summ_print import epi_summ_print
    from introrl.agent_supt.learning_tracker import LearnTracker
    
    from introrl.mdp_data.simple_grid_world import get_gridworld    
    gridworld = get_gridworld()
    
    learn_tracker = LearnTracker()
    
    #policy, action_value, steps_per_episodeL, reward_sum_per_episodeL = \
    policy, action_value = \
        sarsa_epsilon_greedy( gridworld,  learn_tracker=learn_tracker,
                            do_summ_print=True, show_last_change=True, 
                            fmt_Q='%g', fmt_R='%g',
                            use_list_of_start_states=True, # use list OR single start state of environment.
                            max_num_episodes=10000, min_num_episodes=1000, 
                            max_abserr=0.0001, 
                            gamma=0.9,
                            alpha=0.3,     const_alpha=False, alpha_half_life=10000,
                            epsilon=0.1,  # const_epsilon=False, epsilon_half_life=500,
                            max_episode_steps=200,
                            iteration_prints=0)
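
# For reference, the per-step tabular SARSA backup that a routine like the one
# above performs, sketched with a plain dict Q-table.  Illustrative only; this
# is not introrl's internal code.
import random

def sarsa_step(Q, s, a, r, s_next, actions, alpha=0.3, gamma=0.9, epsilon=0.1):
    """One SARSA backup: choose a_next epsilon-greedily, bootstrap on Q(s_next, a_next)."""
    if random.random() < epsilon:
        a_next = random.choice(actions)
    else:
        a_next = max(actions, key=lambda act: Q.get((s_next, act), 0.0))
    target = r + gamma * Q.get((s_next, a_next), 0.0)
    Q[(s, a)] = Q.get((s, a), 0.0) + alpha * (target - Q.get((s, a), 0.0))
    return a_next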
                          
Example #3
                1 if a_desc == 'L' else 0,
                s[1] - 1.5 if a_desc == 'L' else 0,
                (s[0] * s[1] - 3) / 3 if a_desc == 'L' else 0,
                (s[0] * s[0] - 2) / 2 if a_desc == 'L' else 0,
                (s[1] * s[1] - 4.5) / 4.5 if a_desc == 'L' else 0,
                1 if a_desc == 'L' else 0,
                s[0] - 1 if a_desc == 'R' else 0,
                s[1] - 1.5 if a_desc == 'R' else 0,
                (s[0] * s[1] - 3) / 3 if a_desc == 'R' else 0,
                (s[0] * s[0] - 2) / 2 if a_desc == 'R' else 0,
                (s[1] * s[1] - 4.5) / 4.5 if a_desc == 'R' else 0,
                1 if a_desc == 'R' else 0,
                1
            ])

        return x_vector


learn_tracker = LearnTracker()
gridworld = get_gridworld(step_reward=-0.1)

NUM_EPISODES = 20000

alpha_obj = Alpha(alpha=0.1)
alpha_obj.set_half_life_for_N_episodes(Nepisodes=NUM_EPISODES,
                                       alpha_final=0.03333333333333)

eps_obj = EpsilonGreedy(epsilon=0.5)
eps_obj.set_half_life_for_N_episodes(Nepisodes=NUM_EPISODES,
                                     epsilon_final=0.16666666666666)

agent = SA_SemiGradAgent(environment=gridworld,
                         update_type='qlearn',
                         sa_linear_function=LazyProgrammerMaze(gridworld),
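
# The Alpha and EpsilonGreedy objects above decay their values geometrically via
# a half-life schedule.  A minimal sketch of that idea in plain Python; the exact
# decay model introrl uses may differ.
import math

def half_life_decay(value0, episode, half_life):
    """Value after 'episode' episodes when it halves every 'half_life' episodes."""
    return value0 * 0.5 ** (episode / float(half_life))

def half_life_for_final_value(value0, value_final, n_episodes):
    """Half-life such that value0 decays to value_final after n_episodes
    (what set_half_life_for_N_episodes is presumably solving for)."""
    return n_episodes * math.log(0.5) / math.log(value_final / value0)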
Example #4
from future import standard_library
standard_library.install_aliases()
from builtins import str
from builtins import range
from builtins import object

import matplotlib
import matplotlib.pyplot as plt
import random

from introrl.mdp_data.sutton_dyna_grid import get_gridworld
from introrl.agent_supt.learning_tracker import LearnTracker
from introrl.policy import Policy
from introrl.agents.dyna_q_agent import DynaQAgent

learn_tracker_0 = LearnTracker()
learn_tracker_5 = LearnTracker()
learn_tracker_50 = LearnTracker()

gridworld = get_gridworld()
#gridworld.summ_print(long=False)
print('-' * 77)

agent_0 = DynaQAgent(environment=gridworld,
                     learn_tracker=learn_tracker_0,
                     gamma=0.95)
agent_5 = DynaQAgent(environment=gridworld,
                     learn_tracker=learn_tracker_5,
                     gamma=0.95)
agent_50 = DynaQAgent(environment=gridworld,
                      learn_tracker=learn_tracker_50,
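
# The three agents above presumably differ in how many planning updates they
# replay from the learned model each step (n = 0, 5, 50, as in Sutton & Barto's
# Dyna maze comparison).  A minimal tabular sketch of that planning loop; not
# introrl's implementation.
import random

def dyna_q_planning(Q, model, actions, n_planning, alpha=0.1, gamma=0.95):
    """Replay n_planning simulated one-step Q-learning backups from a model dict
    mapping (s, a) -> (reward, s_next)."""
    for _ in range(n_planning):
        s, a = random.choice(list(model.keys()))
        r, s_next = model[(s, a)]
        best_next = max(Q.get((s_next, b), 0.0) for b in actions)
        Q[(s, a)] = Q.get((s, a), 0.0) + alpha * (r + gamma * best_next - Q.get((s, a), 0.0))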
Example #5
start_time = time.time()

CW = CliffWalkingSimulation()
CW.layout.s_hash_print(none_str='*')

Sarsa_raveL = []
Qlearn_raveL = []
TD0_raveL = []
ExpSarsa_raveL = []

RUN_COUNT = 1000
ALPHA = 0.5
EPSILON = 0.1

learn_tracker = LearnTracker()

for loop in range(RUN_COUNT):

    learn_tracker.clear()
    policy_es, state_value_es = \
        expected_sarsa_eps_greedy( CW, learn_tracker=learn_tracker,
                              initial_Qsa=0.0, # initialize Q(s,a) for non-terminal states (terminal states stay 0.0)
                              use_list_of_start_states=False, # use list OR single start state of environment.
                              do_summ_print=False, show_last_change=False, fmt_Q='%g', fmt_R='%g',
                              show_banner=False,
                              pcent_progress_print=0,
                              max_num_episodes=500, min_num_episodes=10, max_abserr=0.001,
                              gamma=1.0,
                              max_episode_steps=1000,
                              epsilon=EPSILON,
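
# Expected SARSA (one of the methods compared above) replaces SARSA's sampled
# Q(s', a') with the expectation of Q(s', .) under the epsilon-greedy policy.
# A plain-Python sketch of that target; not introrl's code.
def expected_sarsa_target(Q, r, s_next, actions, gamma=1.0, epsilon=0.1):
    """r + gamma * sum_a pi(a|s') * Q(s', a) for an epsilon-greedy policy pi."""
    q_vals = [Q.get((s_next, a), 0.0) for a in actions]
    n = len(actions)
    expectation = sum(q * epsilon / n for q in q_vals) + (1.0 - epsilon) * max(q_vals)
    return r + gamma * expectation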
Example #6
import matplotlib.pyplot as plt
from introrl.td_funcs.qlearning_epsilon_greedy import qlearning_epsilon_greedy
from introrl.td_funcs.dbl_qlearning_epsilon_greedy import dbl_qlearning_epsilon_greedy
from introrl.black_box_sims.maximization_bias_mdp import MaximizationBiasMDP
from introrl.agent_supt.learning_tracker import LearnTracker
from introrl.utils.smoother import boxcar

EPSILON = 0.1
ALPHA = 0.1
GAMMA = 1.0
NUM_EPISODES = 300
TOTAL_RUNS = 1000
Nb_choices = 10

MB = MaximizationBiasMDP(Nb_choices=Nb_choices)
learn_tracker = LearnTracker()

left_countsL = [0 for _ in range(NUM_EPISODES)]
dbl_left_countsL = [0 for _ in range(NUM_EPISODES)]

for num_run in range(TOTAL_RUNS):

    learn_tracker.clear()
    policy, state_value = \
        qlearning_epsilon_greedy( MB, learn_tracker=learn_tracker,
                              initial_Qsa=0.0, # initialize Q(s,a) for non-terminal states (terminal states stay 0.0)
                              use_list_of_start_states=False, # use list OR single start state of environment.
                              do_summ_print=False, show_last_change=False, fmt_Q='%g', fmt_R='%g',
                              pcent_progress_print=0,
                              show_banner=False,
                              max_num_episodes=NUM_EPISODES, min_num_episodes=NUM_EPISODES, max_abserr=0.001,
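
# Double Q-learning (the dbl_qlearning_epsilon_greedy routine imported above)
# keeps two Q-tables, using one to pick the greedy action and the other to
# evaluate it, which suppresses the maximization bias this script measures.
# A plain tabular sketch of the update; not introrl's code.
import random

def double_q_update(Qa, Qb, s, a, r, s_next, actions, alpha=0.1, gamma=1.0):
    """One double Q-learning backup on a randomly chosen table."""
    if random.random() < 0.5:
        Qa, Qb = Qb, Qa                      # swap so each table is updated half the time
    a_star = max(actions, key=lambda act: Qa.get((s_next, act), 0.0))
    target = r + gamma * Qb.get((s_next, a_star), 0.0)
    Qa[(s, a)] = Qa.get((s, a), 0.0) + alpha * (target - Qa.get((s, a), 0.0))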
Example #7

for i_sizes in range(NUM_SIZES):

    N_mult = i_sizes + 1

    gridworld = get_gridworld(
        N_mult=N_mult)  #, step_reward=-PRIORITY_THRESHOLD )
    Nstates = len(gridworld.SC)
    print('Nstates =', Nstates, '  N_mult = ', N_mult)
    grid_sizeL[i_sizes] = N_mult

    for main_loop in range(NUM_RUNS):
        print('%i of %i Runs' % (1 + main_loop, NUM_RUNS), end=' ')

        learn_tracker_q = LearnTracker()
        learn_tracker_sw = LearnTracker()

        agent_q = DynaQAgent(environment=gridworld,
                             learn_tracker=learn_tracker_q,
                             max_episode_steps=60000,
                             show_banner=False,
                             do_summ_print=False,
                             show_last_change=False,
                             epsilon=EPSILON,
                             gamma=GAMMA,
                             alpha=ALPHA)

        agent_sw = PrioritySweepAgent(environment=gridworld,
                                      learn_tracker=learn_tracker_sw,
                                      max_episode_steps=60000,
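
# Prioritized sweeping (agent_sw above) queues model backups by the magnitude of
# their expected value change instead of replaying them uniformly.  A minimal
# sketch of the priority test with Python's heapq; not introrl's implementation.
import heapq

def maybe_queue(pqueue, Q, s, a, r, s_next, actions, gamma=0.95, threshold=1.0e-4):
    """Push (s, a) with priority |one-step TD error| when it exceeds the threshold."""
    best_next = max(Q.get((s_next, b), 0.0) for b in actions)
    priority = abs(r + gamma * best_next - Q.get((s, a), 0.0))
    if priority > threshold:
        heapq.heappush(pqueue, (-priority, (s, a)))   # negate: heapq is a min-heap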
Example #8
GAMMA = 0.95
EPSILON = 0.1
PLAN_LOOPS = 10
QPLUS_FACTOR = 1.0E-4

NUM_RUNS = 20

q_raveL = []
qp_raveL = []

maze_q = BlockingMaze()

for main_loop in range(NUM_RUNS):
    print('%i of %i Runs' % (1 + main_loop, NUM_RUNS))

    learn_tracker_q = LearnTracker()
    learn_tracker_qp = LearnTracker()

    agent_q = DynaQAgent(environment=maze_q,
                         learn_tracker=learn_tracker_q,
                         max_episode_steps=3000,
                         show_banner=False,
                         do_summ_print=False,
                         show_last_change=False,
                         epsilon=EPSILON,
                         gamma=GAMMA,
                         alpha=ALPHA)

    agent_qp = DynaQPlusAgent(environment=maze_q,
                              learn_tracker=learn_tracker_qp,
                              max_episode_steps=3000,
Example #9
ALPHA = 0.7
GAMMA = 0.95
EPSILON = 0.1
PLAN_LOOPS = 10
QPLUS_FACTOR = 1.0E-4

NUM_RUNS = 20
SHOW_DQPLUS = True

qp_raveL = []
maze_q = BlockingMaze()

for main_loop in range(NUM_RUNS):
    print('%i of %i Runs' % (1 + main_loop, NUM_RUNS))

    learn_tracker_q = LearnTracker()
    learn_tracker_qp = LearnTracker()

    agent_qp = DynaQPlusAgent(environment=maze_q,
                              learn_tracker=learn_tracker_qp,
                              max_episode_steps=3000,
                              show_banner=False,
                              do_summ_print=False,
                              show_last_change=False,
                              epsilon=EPSILON,
                              gamma=GAMMA,
                              alpha=ALPHA,
                              qplus_factor=QPLUS_FACTOR)

    # set gates at time = 0
    maze_q.open_gate_R()
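
# Dyna-Q+ adds an exploration bonus kappa * sqrt(tau) to planned rewards, where
# tau is the number of time steps since the state-action pair was last tried;
# QPLUS_FACTOR above presumably plays the role of kappa.  A plain sketch of the
# bonus; not introrl's code.
import math

def qplus_reward(r, time_now, last_visit_time, kappa=1.0e-4):
    """Model reward plus the Dyna-Q+ bonus for long-untried (s, a) pairs."""
    tau = time_now - last_visit_time
    return r + kappa * math.sqrt(tau)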
Example #10
ALPHA = 0.5
GAMMA = 0.95
EPSILON = 0.25

NUM_RUNS = 20

q_raveL = []
qp_raveL = []

for main_loop in range(NUM_RUNS):
    print('\n%i of %i Runs' % (1 + main_loop, NUM_RUNS))

    maze_q = BlockingMaze()

    learn_tracker_q = LearnTracker()

    # set gates at time = 0
    maze_q.open_gate_R()
    maze_q.close_gate_L()

    # episodes
    time_stamp = 0
    read_pickle_file = ''
    for i in range(400):
        if time_stamp >= 1000:
            maze_q.open_gate_L()
            maze_q.close_gate_R()

        if time_stamp >= 3000:
            break
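
# Each run's results are typically folded element-wise into running-average
# curves (the q_raveL / qp_raveL lists above suggest that).  A generic sketch of
# such an update in plain Python; not the original script's code.
def update_running_average(avg_list, new_list, n_completed_runs):
    """Fold a new per-step result list into an element-wise running average."""
    if not avg_list:
        return list(new_list)
    return [(a * n_completed_runs + b) / (n_completed_runs + 1.0)
            for a, b in zip(avg_list, new_list)]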