import matplotlib.pyplot as plt

from introrl.td_funcs.sarsa_epsilon_greedy import sarsa_epsilon_greedy
from introrl.agent_supt.episode_maker import make_episode
from introrl.agent_supt.episode_summ_print import epi_summ_print
from introrl.mdp_data.windy_gridworld import get_gridworld
from introrl.agent_supt.learning_tracker import LearnTracker

gridworld = get_gridworld(step_reward=-1)
learn_tracker = LearnTracker()

policy, state_value = \
    sarsa_epsilon_greedy(gridworld,
                         learn_tracker=learn_tracker,
                         initial_Qsa=0.0,  # init non-terminal_set of V(s) (terminal_set=0.0)
                         read_pickle_file='',
                         save_pickle_file='',
                         use_list_of_start_states=False,  # use list OR single start state of environment.
                         do_summ_print=True,
                         show_last_change=True,
                         fmt_Q='%g', fmt_R='%g',
                         max_num_episodes=170, min_num_episodes=10,
                         max_abserr=0.001, gamma=1.0,
                         iteration_prints=0,
                         max_episode_steps=10000,
                         epsilon=0.1, const_epsilon=True,
                         alpha=0.5, const_alpha=True)

print('_' * 55)
score = gridworld.get_policy_score(policy, start_state_hash=None, step_limit=1000)
print('Policy Score =', score, ' = (r_sum, n_steps, msg)')
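For reference, the per-step rule that sarsa_epsilon_greedy applies is the standard on-policy TD(0) update. Below is a minimal sketch in plain Python; the dict-based Q table and helper names are illustrative, not introrl's internals.

import random

# Minimal sketch of tabular SARSA with an epsilon-greedy policy (illustrative).
# Q is a dict mapping (state, action) -> float; alpha, gamma, epsilon as above.
def sarsa_update(Q, s, a, r, s_next, a_next, alpha=0.5, gamma=1.0):
    """One update: Q(s,a) += alpha * (r + gamma*Q(s',a') - Q(s,a))."""
    td_target = r + gamma * Q.get((s_next, a_next), 0.0)
    td_error = td_target - Q.get((s, a), 0.0)
    Q[(s, a)] = Q.get((s, a), 0.0) + alpha * td_error

def epsilon_greedy_action(Q, s, actions, epsilon=0.1):
    """Pick a random action with probability epsilon, else the greedy one."""
    if random.random() < epsilon:
        return random.choice(actions)
    return max(actions, key=lambda a: Q.get((s, a), 0.0))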
    if save_pickle_file:
        policy.save_to_pickle_file(save_pickle_file)
        action_value_coll.save_to_pickle_file(save_pickle_file)

    return policy, action_value_coll  # , steps_per_episodeL, reward_sum_per_episodeL


if __name__ == "__main__":  # pragma: no cover

    from introrl.agent_supt.episode_maker import make_episode
    from introrl.agent_supt.episode_summ_print import epi_summ_print
    from introrl.agent_supt.learning_tracker import LearnTracker
    from introrl.mdp_data.simple_grid_world import get_gridworld

    gridworld = get_gridworld()
    learn_tracker = LearnTracker()

    # policy, action_value, steps_per_episodeL, reward_sum_per_episodeL = \
    policy, action_value = \
        sarsa_epsilon_greedy(gridworld,
                             learn_tracker=learn_tracker,
                             do_summ_print=True, show_last_change=True,
                             fmt_Q='%g', fmt_R='%g',
                             use_list_of_start_states=True,  # use list OR single start state of environment.
                             max_num_episodes=10000, min_num_episodes=1000,
                             max_abserr=0.0001,
                             gamma=0.9,
                             alpha=0.3, const_alpha=False, alpha_half_life=10000,
                             epsilon=0.1,  # const_epsilon=False, epsilon_half_life=500,
                             max_episode_steps=200,
                             iteration_prints=0)
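The alpha_half_life and epsilon_half_life arguments decay the corresponding parameter as episodes accumulate. A sketch of that kind of schedule, assuming a simple exponential half-life (introrl's exact decay formula may differ):

# Sketch of a half-life decay schedule (assumed form, not introrl's exact formula).
def half_life_value(initial, episode, half_life):
    """Value after 'episode' episodes when it halves every 'half_life' episodes."""
    return initial * 0.5 ** (episode / float(half_life))

# e.g. alpha=0.3 with alpha_half_life=10000 reaches 0.15 at episode 10000.
print(half_life_value(0.3, 10000, 10000))  # 0.15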
                          1 if a_desc == 'L' else 0,
                          s[1] - 1.5 if a_desc == 'L' else 0,
                          (s[0] * s[1] - 3) / 3 if a_desc == 'L' else 0,
                          (s[0] * s[0] - 2) / 2 if a_desc == 'L' else 0,
                          (s[1] * s[1] - 4.5) / 4.5 if a_desc == 'L' else 0,
                          1 if a_desc == 'L' else 0,
                          s[0] - 1 if a_desc == 'R' else 0,
                          s[1] - 1.5 if a_desc == 'R' else 0,
                          (s[0] * s[1] - 3) / 3 if a_desc == 'R' else 0,
                          (s[0] * s[0] - 2) / 2 if a_desc == 'R' else 0,
                          (s[1] * s[1] - 4.5) / 4.5 if a_desc == 'R' else 0,
                          1 if a_desc == 'R' else 0,
                          1])
    return x_vector


learn_tracker = LearnTracker()
gridworld = get_gridworld(step_reward=-0.1)

NUM_EPISODES = 20000

alpha_obj = Alpha(alpha=0.1)
alpha_obj.set_half_life_for_N_episodes(Nepisodes=NUM_EPISODES, alpha_final=0.03333333333333)

eps_obj = EpsilonGreedy(epsilon=0.5)
eps_obj.set_half_life_for_N_episodes(Nepisodes=NUM_EPISODES, epsilon_final=0.16666666666666)

agent = SA_SemiGradAgent(environment=gridworld,
                         update_type='qlearn',
                         sa_linear_function=LazyProgrammerMaze(gridworld),
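With update_type='qlearn', SA_SemiGradAgent performs semi-gradient Q-learning on a weight vector; for a linear approximator the gradient of q(s,a,w) is just the feature vector x(s,a), like the one built above. A sketch of one such step using numpy (illustrative, not the agent's actual code):

import numpy as np

# Sketch of a semi-gradient Q-learning step for a linear approximator
# q(s,a,w) = w . x(s,a), where x(s,a) is a feature vector like the one above.
def semi_grad_qlearn_step(w, x_sa, r, x_next_list, alpha, gamma):
    """w <- w + alpha*(r + gamma*max_a' w.x(s',a') - w.x(s,a)) * x(s,a)."""
    q_sa = w.dot(x_sa)
    q_next = max((w.dot(x) for x in x_next_list), default=0.0)  # 0 at terminal
    w += alpha * (r + gamma * q_next - q_sa) * x_sa
    return w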
from future import standard_library
standard_library.install_aliases()
from builtins import str
from builtins import range
from builtins import object

import matplotlib
import matplotlib.pyplot as plt
import random

from introrl.mdp_data.sutton_dyna_grid import get_gridworld
from introrl.agent_supt.learning_tracker import LearnTracker
from introrl.policy import Policy
from introrl.agents.dyna_q_agent import DynaQAgent

learn_tracker_0 = LearnTracker()
learn_tracker_5 = LearnTracker()
learn_tracker_50 = LearnTracker()

gridworld = get_gridworld()
# gridworld.summ_print(long=False)
print('-' * 77)

agent_0 = DynaQAgent(environment=gridworld, learn_tracker=learn_tracker_0, gamma=0.95)
agent_5 = DynaQAgent(environment=gridworld, learn_tracker=learn_tracker_5, gamma=0.95)
agent_50 = DynaQAgent(environment=gridworld, learn_tracker=learn_tracker_50,
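Dyna-Q interleaves direct RL updates with planning updates replayed from a learned model; the three agents above differ only in how many planning loops run per real step (0, 5, 50). A sketch of the planning phase (illustrative, not DynaQAgent's internal code):

import random

# Sketch of Dyna-Q's planning phase (illustrative).
# model maps (s, a) -> (r, s_next) from real experience; Q maps (s, a) -> float.
def dyna_planning(Q, model, actions, n_loops, alpha, gamma):
    for _ in range(n_loops):
        s, a = random.choice(list(model.keys()))   # previously experienced pair
        r, s_next = model[(s, a)]                  # model's remembered outcome
        q_next = max(Q.get((s_next, b), 0.0) for b in actions)
        Q[(s, a)] = Q.get((s, a), 0.0) + alpha * (r + gamma * q_next - Q.get((s, a), 0.0))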
start_time = time.time()

CW = CliffWalkingSimulation()
CW.layout.s_hash_print(none_str='*')

Sarsa_raveL = []
Qlearn_raveL = []
TD0_raveL = []
ExpSarsa_raveL = []

RUN_COUNT = 1000
ALPHA = 0.5
EPSILON = 0.1

learn_tracker = LearnTracker()

for loop in range(RUN_COUNT):
    learn_tracker.clear()

    policy_es, state_value_es = \
        expected_sarsa_eps_greedy(CW,
                                  learn_tracker=learn_tracker,
                                  initial_Qsa=0.0,  # init non-terminal_set of V(s) (terminal_set=0.0)
                                  use_list_of_start_states=False,  # use list OR single start state of environment.
                                  do_summ_print=False, show_last_change=False,
                                  fmt_Q='%g', fmt_R='%g',
                                  show_banner=False,
                                  pcent_progress_print=0,
                                  max_num_episodes=500, min_num_episodes=10,
                                  max_abserr=0.001, gamma=1.0,
                                  max_episode_steps=1000,
                                  epsilon=EPSILON,
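Expected SARSA replaces the sampled next-action value in the SARSA target with the expectation over the epsilon-greedy policy, which is what makes it more stable than SARSA on Cliff Walking. A sketch of that target computation (illustrative; ties among greedy actions are ignored for brevity):

# Sketch of the Expected SARSA target under an epsilon-greedy policy (illustrative).
def expected_sarsa_target(Q, s_next, actions, r, gamma, epsilon):
    """r + gamma * sum_a pi(a|s') * Q(s',a), with pi epsilon-greedy w.r.t. Q."""
    q_vals = [Q.get((s_next, a), 0.0) for a in actions]
    n = len(actions)
    greedy_q = max(q_vals)
    expected_q = sum((epsilon / n) * q for q in q_vals) + (1.0 - epsilon) * greedy_q
    return r + gamma * expected_q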
import matplotlib.pyplot as plt

from introrl.td_funcs.qlearning_epsilon_greedy import qlearning_epsilon_greedy
from introrl.td_funcs.dbl_qlearning_epsilon_greedy import dbl_qlearning_epsilon_greedy
from introrl.black_box_sims.maximization_bias_mdp import MaximizationBiasMDP
from introrl.agent_supt.learning_tracker import LearnTracker
from introrl.utils.smoother import boxcar

EPSILON = 0.1
ALPHA = 0.1
GAMMA = 1.0
NUM_EPISODES = 300
TOTAL_RUNS = 1000
Nb_choices = 10

MB = MaximizationBiasMDP(Nb_choices=Nb_choices)
learn_tracker = LearnTracker()

left_countsL = [0 for _ in range(NUM_EPISODES)]
dbl_left_countsL = [0 for _ in range(NUM_EPISODES)]

for num_run in range(TOTAL_RUNS):
    learn_tracker.clear()

    policy, state_value = \
        qlearning_epsilon_greedy(MB,
                                 learn_tracker=learn_tracker,
                                 initial_Qsa=0.0,  # init non-terminal_set of V(s) (terminal_set=0.0)
                                 use_list_of_start_states=False,  # use list OR single start state of environment.
                                 do_summ_print=False, show_last_change=False,
                                 fmt_Q='%g', fmt_R='%g',
                                 pcent_progress_print=0,
                                 show_banner=False,
                                 max_num_episodes=NUM_EPISODES, min_num_episodes=NUM_EPISODES,
                                 max_abserr=0.001,
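Double Q-learning keeps two tables and uses one to choose the argmax action and the other to evaluate it, which removes the maximization bias this script measures. A sketch of one update (illustrative, not introrl's internals):

import random

# Sketch of one double Q-learning update (illustrative).
def dbl_q_update(Q1, Q2, s, a, r, s_next, actions, alpha, gamma):
    if random.random() < 0.5:
        Q1, Q2 = Q2, Q1                                            # update the other table half the time
    a_star = max(actions, key=lambda b: Q1.get((s_next, b), 0.0))  # select with one table
    target = r + gamma * Q2.get((s_next, a_star), 0.0)             # evaluate with the other
    Q1[(s, a)] = Q1.get((s, a), 0.0) + alpha * (target - Q1.get((s, a), 0.0))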
for i_sizes in range(NUM_SIZES):
    N_mult = i_sizes + 1
    gridworld = get_gridworld(N_mult=N_mult)  # , step_reward=-PRIORITY_THRESHOLD )

    Nstates = len(gridworld.SC)
    print('Nstates =', Nstates, ' N_mult = ', N_mult)
    grid_sizeL[i_sizes] = N_mult

    for main_loop in range(NUM_RUNS):
        print('%i of %i Runs' % (1 + main_loop, NUM_RUNS), end=' ')

        learn_tracker_q = LearnTracker()
        learn_tracker_sw = LearnTracker()

        agent_q = DynaQAgent(environment=gridworld,
                             learn_tracker=learn_tracker_q,
                             max_episode_steps=60000,
                             show_banner=False,
                             do_summ_print=False, show_last_change=False,
                             epsilon=EPSILON, gamma=GAMMA, alpha=ALPHA)

        agent_sw = PrioritySweepAgent(environment=gridworld,
                                      learn_tracker=learn_tracker_sw,
                                      max_episode_steps=60000,
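Prioritized sweeping replaces Dyna-Q's random planning backups with a priority queue keyed on the magnitude of each state-action pair's TD error, which is why it scales to the larger grids this loop generates. A sketch of the queue discipline (illustrative, not PrioritySweepAgent's code):

import heapq

# Sketch of prioritized sweeping's queue discipline (illustrative).
# Pairs enter the queue only when their potential TD error exceeds a threshold;
# heapq is a min-heap, so the negated priority makes the largest error pop first.
def push_if_significant(pqueue, s, a, td_error, threshold=1.0e-4):
    if abs(td_error) > threshold:
        heapq.heappush(pqueue, (-abs(td_error), (s, a)))

def pop_highest_priority(pqueue):
    neg_p, (s, a) = heapq.heappop(pqueue)
    return s, a, -neg_p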
GAMMA = 0.95
EPSILON = 0.1
PLAN_LOOPS = 10
QPLUS_FACTOR = 1.0E-4
NUM_RUNS = 20

q_raveL = []
qp_raveL = []

maze_q = BlockingMaze()

for main_loop in range(NUM_RUNS):
    print('%i of %i Runs' % (1 + main_loop, NUM_RUNS))

    learn_tracker_q = LearnTracker()
    learn_tracker_qp = LearnTracker()

    agent_q = DynaQAgent(environment=maze_q,
                         learn_tracker=learn_tracker_q,
                         max_episode_steps=3000,
                         show_banner=False,
                         do_summ_print=False, show_last_change=False,
                         epsilon=EPSILON, gamma=GAMMA, alpha=ALPHA)

    agent_qp = DynaQPlusAgent(environment=maze_q,
                              learn_tracker=learn_tracker_qp,
                              max_episode_steps=3000,
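Dyna-Q+ differs from Dyna-Q only in adding an exploration bonus to planning rewards that grows with the time since a state-action pair was last tried, so the agent rediscovers the maze after a gate change; QPLUS_FACTOR above plays the role of kappa. A sketch of the bonus, following Sutton & Barto's formulation:

import math

# Sketch of the Dyna-Q+ exploration bonus applied during planning (illustrative).
# kappa corresponds to QPLUS_FACTOR above; tau is the number of real time steps
# since (s, a) was last tried in the environment.
def qplus_reward(r_model, tau, kappa=1.0e-4):
    """Planning reward r + kappa * sqrt(tau)."""
    return r_model + kappa * math.sqrt(tau)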
ALPHA = 0.7
GAMMA = 0.95
EPSILON = 0.1
PLAN_LOOPS = 10
QPLUS_FACTOR = 1.0E-4
NUM_RUNS = 20
SHOW_DQPLUS = True

qp_raveL = []

maze_q = BlockingMaze()

for main_loop in range(NUM_RUNS):
    print('%i of %i Runs' % (1 + main_loop, NUM_RUNS))

    learn_tracker_q = LearnTracker()
    learn_tracker_qp = LearnTracker()

    agent_qp = DynaQPlusAgent(environment=maze_q,
                              learn_tracker=learn_tracker_qp,
                              max_episode_steps=3000,
                              show_banner=False,
                              do_summ_print=False, show_last_change=False,
                              epsilon=EPSILON, gamma=GAMMA, alpha=ALPHA,
                              qplus_factor=QPLUS_FACTOR)

    # set gates at time = 0
    maze_q.open_gate_R()
ALPHA = 0.5
GAMMA = 0.95
EPSILON = 0.25
NUM_RUNS = 20

q_raveL = []
qp_raveL = []

for main_loop in range(NUM_RUNS):
    print('\n%i of %i Runs' % (1 + main_loop, NUM_RUNS))

    maze_q = BlockingMaze()
    learn_tracker_q = LearnTracker()

    # set gates at time = 0
    maze_q.open_gate_R()
    maze_q.close_gate_L()

    # episodes
    time_stamp = 0
    read_pickle_file = ''
    for i in range(400):
        if time_stamp >= 1000:
            maze_q.open_gate_L()
            maze_q.close_gate_R()
        if time_stamp >= 3000:
            break