def experiment(p):
    # Parameters
    gamma = .9
    n_env = 5
    size = p['size']
    env_distribution = make_env_distribution(
        env_class='tight',
        n_env=n_env,
        gamma=gamma,
        env_name=p['name'],
        w=size,
        h=size,
        stochastic=p['stochastic']
    )
    actions = env_distribution.get_actions()
    n_known = p['n_known']
    p_min = 1. / float(n_env)
    epsilon_q = .01
    epsilon_m = .01
    delta = .1
    r_max = 1.
    v_max = p['v_max']
    n_states = 4
    max_mem = 1

    # Agents
    rmax = RMax(actions=actions, gamma=gamma, r_max=r_max, v_max=v_max, deduce_v_max=False, n_known=n_known,
                deduce_n_known=False, epsilon_q=epsilon_q, epsilon_m=epsilon_m, name='RMax')
    lrmax = LRMax(actions=actions, gamma=gamma, r_max=r_max, v_max=v_max, deduce_v_max=False, n_known=n_known,
                  deduce_n_known=False, epsilon_q=epsilon_q, epsilon_m=epsilon_m, delta=delta, n_states=n_states,
                  max_memory_size=max_mem, prior=None, estimate_distances_online=True,
                  min_sampling_probability=p_min, name='LRMax')
    lrmax_p01 = LRMax(actions=actions, gamma=gamma, r_max=r_max, v_max=v_max, deduce_v_max=False, n_known=n_known,
                      deduce_n_known=False, epsilon_q=epsilon_q, epsilon_m=epsilon_m, delta=delta,
                      n_states=n_states, max_memory_size=max_mem, prior=0.1, estimate_distances_online=True,
                      min_sampling_probability=p_min, name='LRMax(Dmax=0.1)')
    lrmax_p02 = LRMax(actions=actions, gamma=gamma, r_max=r_max, v_max=v_max, deduce_v_max=False, n_known=n_known,
                      deduce_n_known=False, epsilon_q=epsilon_q, epsilon_m=epsilon_m, delta=delta,
                      n_states=n_states, max_memory_size=max_mem, prior=0.2, estimate_distances_online=True,
                      min_sampling_probability=p_min, name='LRMax(Dmax=0.2)')
    maxqinit = MaxQInit(actions=actions, gamma=gamma, r_max=r_max, v_max=v_max, deduce_v_max=False,
                        n_known=n_known, deduce_n_known=False, epsilon_q=epsilon_q, epsilon_m=epsilon_m,
                        delta=delta, n_states=n_states, min_sampling_probability=p_min, name='MaxQInit')
    lrmaxqinit = LRMaxQInit(actions=actions, gamma=gamma, r_max=r_max, v_max=v_max, deduce_v_max=False,
                            n_known=n_known, deduce_n_known=False, epsilon_q=epsilon_q, epsilon_m=epsilon_m,
                            delta=delta, n_states=n_states, max_memory_size=max_mem, prior=None,
                            estimate_distances_online=True, min_sampling_probability=p_min, name='LRMaxQInit')
    lrmaxqinit_p01 = LRMaxQInit(actions=actions, gamma=gamma, r_max=r_max, v_max=v_max, deduce_v_max=False,
                                n_known=n_known, deduce_n_known=False, epsilon_q=epsilon_q, epsilon_m=epsilon_m,
                                delta=delta, n_states=n_states, max_memory_size=max_mem, prior=0.1,
                                estimate_distances_online=True, min_sampling_probability=p_min,
                                name='LRMaxQInit(Dmax=0.1)')
    lrmaxqinit_p02 = LRMaxQInit(actions=actions, gamma=gamma, r_max=r_max, v_max=v_max, deduce_v_max=False,
                                n_known=n_known, deduce_n_known=False, epsilon_q=epsilon_q, epsilon_m=epsilon_m,
                                delta=delta, n_states=n_states, max_memory_size=max_mem, prior=0.2,
                                estimate_distances_online=True, min_sampling_probability=p_min,
                                name='LRMaxQInit(Dmax=0.2)')

    agents_pool = [rmax, lrmax, lrmax_p01, lrmax_p02, maxqinit, lrmaxqinit, lrmaxqinit_p01, lrmaxqinit_p02]

    # Run
    run_agents_lifelong(agents_pool, env_distribution, n_instances=3, n_tasks=p['n_tasks'],
                        n_episodes=p['n_episodes'], n_steps=p['n_steps'], reset_at_terminal=False,
                        open_plot=False, plot_title=True, do_run=True, do_plot=True, parallel_run=True,
                        n_processes=None)
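# Minimal usage sketch (illustration only, not part of the original script): the dictionary below lists the keys
# that experiment(p) above actually reads; the concrete values are hypothetical and would need to be chosen for
# the experiment at hand.
if __name__ == '__main__':
    example_params = {
        'size': 4,                # grid width and height passed to make_env_distribution
        'name': 'tight-example',  # env_name of the generated environment distribution (hypothetical)
        'stochastic': False,      # whether the sampled environments have stochastic transitions
        'n_known': 10,            # count after which a state-action pair is considered known
        'v_max': 1.,              # known upper-bound on the value function
        'n_tasks': 20,            # number of tasks sampled from the distribution
        'n_episodes': 100,        # episodes per task
        'n_steps': 10             # steps per episode
    }
    experiment(example_params)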
def reset(self):
    """
    Reset the attributes to initial state (called between instances).
    Save the previous model.
    :return: None
    """
    LRMax.reset(self)

    n_bound_use = self.n_rmax + self.n_lip
    if n_bound_use > 0:
        # Save ratio
        ratio_rmax_bound_use = self.n_rmax / n_bound_use
        ratio_lip_bound_use = self.n_lip / n_bound_use
        if self.write_data:
            self.write(init=False, ratio_rmax_bound_use=ratio_rmax_bound_use,
                       ratio_lip_bound_use=ratio_lip_bound_use)
        # Reset
        self.n_rmax = 0
        self.n_lip = 0

    # Reset recorded variables between MDPs
    self.discounted_return = 0.
    self.total_return = 0.
    self.n_time_steps = 0
    self.update_time_steps = []
def experiment():
    n_env = 5
    env_distribution = make_env_distribution(env_class='maze-mono-goal', env_name='maze-mono-goal',
                                             n_env=n_env, gamma=GAMMA)
    actions = env_distribution.get_actions()
    p_min = 1. / float(n_env)
    delta = .1
    m = 100
    max_mem = 10

    rmax = RMax(actions=actions, gamma=GAMMA, count_threshold=m)
    rmax_q = MaxQInit(actions=actions, gamma=GAMMA, count_threshold=m, min_sampling_probability=p_min, delta=delta)
    lrmax1 = LRMax(actions=actions, gamma=GAMMA, count_threshold=m, max_memory_size=max_mem, prior=1.)
    lrmax05 = LRMax(actions=actions, gamma=GAMMA, count_threshold=m, max_memory_size=max_mem, prior=0.5)
    lrmax02 = LRMax(actions=actions, gamma=GAMMA, count_threshold=m, max_memory_size=max_mem, prior=0.2)
    lrmax_learn = LRMax(
        actions=actions,
        gamma=GAMMA,
        count_threshold=m,
        max_memory_size=max_mem,
        prior=None,
        min_sampling_probability=p_min,
        delta=delta
    )

    agents_pool = [rmax, lrmax1, lrmax05, lrmax02, lrmax_learn, rmax_q]

    run_agents_lifelong(
        agents_pool, env_distribution, samples=20, episodes=100, steps=1000, reset_at_terminal=False,
        open_plot=True, cumulative_plot=False, is_tracked_value_discounted=True, plot_only=False, plot_title=False
    )
def reset(self):
    """
    Reset the attributes to initial state (called between instances).
    Save the previous model.
    :return: None
    """
    self.update_sa_memory()
    LRMax.reset(self)
def __init__(self, actions, gamma=.9, r_max=1., v_max=None, deduce_v_max=True, n_known=None, deduce_n_known=True,
             epsilon_q=0.1, epsilon_m=None, delta=None, n_states=None, max_memory_size=None, prior=None,
             estimate_distances_online=True, min_sampling_probability=.1, name="ExpLRMax", path='results/'):
    LRMax.__init__(self, actions=actions, gamma=gamma, r_max=r_max, v_max=v_max, deduce_v_max=deduce_v_max,
                   n_known=n_known, deduce_n_known=deduce_n_known, epsilon_q=epsilon_q, epsilon_m=epsilon_m,
                   delta=delta, n_states=n_states, max_memory_size=max_memory_size, prior=prior,
                   estimate_distances_online=estimate_distances_online,
                   min_sampling_probability=min_sampling_probability, name=name)

    # Counters used for experiments (not useful to the algorithm)
    self.n_rmax = 0  # number of times the rmax bound is used for all the updates of 1 task
    self.n_lip = 0  # number of times the lipschitz bound is used for all the updates of 1 task

    # Counter for prior use
    self.n_prior_use = 0  # number of times the prior is used for each update of 1 task
    self.n_dista_use = 0  # number of times the distance is used for each update of 1 task

    # Recorded variables
    self.discounted_return = 0.
    self.total_return = 0.
    self.n_time_steps = 0  # number of time steps
    self.path = path
    self.write_data = False  # Enable data writing
    self.instance_number = 0
    self.run_number = 0
def __init__(
        self,
        actions,
        gamma=.9,
        r_max=1.,
        v_max=None,
        deduce_v_max=True,
        n_known=None,
        deduce_n_known=True,
        epsilon_q=0.1,
        epsilon_m=None,
        delta=None,
        n_states=None,
        max_memory_size=None,
        prior=None,
        estimate_distances_online=True,
        min_sampling_probability=.1,
        name="LRMaxQInit"
):
    """
    :param actions: action space of the environment
    :param gamma: (float) discount factor
    :param r_max: (float) known upper-bound on the reward function
    :param v_max: (float) known upper-bound on the value function
    :param deduce_v_max: (bool) set to True to deduce v_max from r_max
    :param n_known: (int) count after which a state-action pair is considered known
    (only set n_known if delta and epsilon are not defined)
    :param deduce_n_known: (bool) set to True to deduce n_known from (delta, n_states, epsilon_m)
    :param epsilon_q: (float) precision of value iteration algorithm for Q-value computation
    :param epsilon_m: (float) precision of the learned models in L1 norm
    :param delta: (float) models are learned epsilon_m-closely with probability at least 1 - delta
    :param n_states: (int) number of states
    :param max_memory_size: (int) maximum number of saved models (infinity if None)
    :param prior: (float) prior knowledge of maximum model's distance
    :param estimate_distances_online: (bool) set to True for online estimation of a tighter upper-bound for the
    model pseudo-distances. The estimation is valid with high probability.
    :param min_sampling_probability: (float) minimum sampling probability of an environment
    :param name: (str)
    """
    self.name = name
    self.n_required_tasks = mqi.number_of_tasks_for_high_confidence_upper_bound(delta, min_sampling_probability)
    self.maxQ_memory = []  # Upper-bounds on the Q-values of previous MDPs

    LRMax.__init__(self, actions=actions, gamma=gamma, r_max=r_max, v_max=v_max, deduce_v_max=deduce_v_max,
                   n_known=n_known, deduce_n_known=deduce_n_known, epsilon_q=epsilon_q, epsilon_m=epsilon_m,
                   delta=delta, n_states=n_states, max_memory_size=max_memory_size, prior=prior,
                   estimate_distances_online=estimate_distances_online,
                   min_sampling_probability=min_sampling_probability, name=name)
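# Hedged instantiation sketch (not part of the original class module): it mirrors the docstring above and the
# keyword arguments used in the experiment scripts; the action set, bounds, and name below are hypothetical.
def _example_lrmaxqinit_agent():
    return LRMaxQInit(
        actions=['up', 'down', 'left', 'right'],  # hypothetical action set of the environment distribution
        gamma=0.9,
        r_max=1.,
        v_max=1.,
        deduce_v_max=False,        # v_max is given explicitly rather than deduced from r_max
        n_known=10,
        deduce_n_known=False,      # n_known is given explicitly rather than deduced from (delta, n_states, epsilon_m)
        epsilon_q=0.01,
        epsilon_m=0.01,
        delta=0.1,
        n_states=4,
        max_memory_size=None,      # keep every previously learned model
        prior=0.2,                 # prior upper-bound on the maximal model pseudo-distance
        estimate_distances_online=True,
        min_sampling_probability=0.2,
        name='LRMaxQInit(example)'
    )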
def experiment():
    # Parameters
    gamma = .9
    env_distribution = make_env_distribution(env_class='deterministic-super-tight',
                                             env_name='deterministic-super-tight-bignknown', gamma=gamma)
    actions = env_distribution.get_actions()
    n_known = 100
    p_min = 1. / 3.
    epsilon_q = .01
    epsilon_m = .01
    delta = .1
    r_max = 1.
    v_max = 1.
    n_states = 4
    max_mem = 9

    # Agents
    rmax = RMax(actions=actions, gamma=gamma, r_max=r_max, v_max=v_max, deduce_v_max=False, n_known=n_known,
                deduce_n_known=False, epsilon_q=epsilon_q, epsilon_m=epsilon_m, name='RMax')
    lrmax = LRMax(actions=actions, gamma=gamma, r_max=r_max, v_max=v_max, deduce_v_max=False, n_known=n_known,
                  deduce_n_known=False, epsilon_q=epsilon_q, epsilon_m=epsilon_m, delta=delta, n_states=n_states,
                  max_memory_size=max_mem, prior=None, estimate_distances_online=True,
                  min_sampling_probability=p_min, name='LRMax')
    lrmaxprior = LRMax(actions=actions, gamma=gamma, r_max=r_max, v_max=v_max, deduce_v_max=False, n_known=n_known,
                       deduce_n_known=False, epsilon_q=epsilon_q, epsilon_m=epsilon_m, delta=delta,
                       n_states=n_states, max_memory_size=max_mem, prior=0.1, estimate_distances_online=True,
                       min_sampling_probability=p_min, name='LRMax(Dmax=0.1)')
    maxqinit = MaxQInit(actions=actions, gamma=gamma, r_max=r_max, v_max=v_max, deduce_v_max=False,
                        n_known=n_known, deduce_n_known=False, epsilon_q=epsilon_q, epsilon_m=epsilon_m,
                        delta=delta, n_states=n_states, min_sampling_probability=p_min, name='MaxQInit')
    lrmaxqinit = LRMaxQInit(actions=actions, gamma=gamma, r_max=r_max, v_max=v_max, deduce_v_max=False,
                            n_known=n_known, deduce_n_known=False, epsilon_q=epsilon_q, epsilon_m=epsilon_m,
                            delta=delta, n_states=n_states, max_memory_size=max_mem, prior=None,
                            estimate_distances_online=True, min_sampling_probability=p_min, name='LRMaxQInit')
    lrmaxqinitprior = LRMaxQInit(actions=actions, gamma=gamma, r_max=r_max, v_max=v_max, deduce_v_max=False,
                                 n_known=n_known, deduce_n_known=False, epsilon_q=epsilon_q, epsilon_m=epsilon_m,
                                 delta=delta, n_states=n_states, max_memory_size=max_mem, prior=0.1,
                                 estimate_distances_online=True, min_sampling_probability=p_min,
                                 name='LRMaxQInit(Dmax=0.1)')

    agents_pool = [rmax, lrmax, lrmaxprior, maxqinit, lrmaxqinit, lrmaxqinitprior]

    # Run
    run_agents_lifelong(agents_pool, env_distribution, n_instances=1, n_tasks=100, n_episodes=200, n_steps=100,
                        reset_at_terminal=False, open_plot=False, plot_title=True, do_run=False, do_plot=True,
                        parallel_run=True, n_processes=None)
def __init__(self, actions, gamma=.9, r_max=1., v_max=None, deduce_v_max=True, n_known=None, deduce_n_known=True,
             epsilon_q=0.1, epsilon_m=None, delta=None, n_states=None, max_memory_size=None, prior=None,
             estimate_distances_online=True, min_sampling_probability=.1, name="ExpLRMax"):
    """
    See LRMax class.
    """
    LRMax.__init__(self, actions=actions, gamma=gamma, r_max=r_max, v_max=v_max, deduce_v_max=deduce_v_max,
                   n_known=n_known, deduce_n_known=deduce_n_known, epsilon_q=epsilon_q, epsilon_m=epsilon_m,
                   delta=delta, n_states=n_states, max_memory_size=max_memory_size, prior=prior,
                   estimate_distances_online=estimate_distances_online,
                   min_sampling_probability=min_sampling_probability, name=name)

    self.time_step = 0
    self.time_step_counter = []
    self.data = {'n_computation': [0], 'n_prior_use': [0]}
def reset(self):
    """
    Reset the attributes to initial state (called between instances).
    Save the previous model.
    :return: None
    """
    LRMax.reset(self)

    '''
    n_bound_use = self.n_rmax + self.n_lip
    if n_bound_use > 0:
        if self.write_data:
            self.write(init=False)
    '''

    # Reset counters
    self.reset_counters()

    # Reset recorded variables between MDPs
    self.discounted_return = 0.
    self.total_return = 0.
    self.n_time_steps = 0
def experiment():
    # Parameters
    gamma = .9
    env_distribution = make_env_distribution(env_class='stochastic-tight', env_name='stochastic-tight', gamma=gamma)
    actions = env_distribution.get_actions()
    n_known = 10
    p_min = 1. / 7.  # There are seven possible MDPs
    epsilon_q = .1
    epsilon_m = .01
    delta = .1
    r_max = 1.
    v_max = 1.
    n_states = 4
    max_mem = 10

    # Agents
    rmax = RMax(actions=actions, gamma=gamma, r_max=r_max, v_max=v_max, deduce_v_max=False, n_known=n_known,
                deduce_n_known=False, epsilon_q=epsilon_q, epsilon_m=epsilon_m, name='RMax')
    lrmax = LRMax(actions=actions, gamma=gamma, r_max=r_max, v_max=v_max, deduce_v_max=False, n_known=n_known,
                  deduce_n_known=False, epsilon_q=epsilon_q, epsilon_m=epsilon_m, delta=delta, n_states=n_states,
                  max_memory_size=max_mem, prior=None, estimate_distances_online=True,
                  min_sampling_probability=p_min, name='LRMax')
    lrmaxprior = LRMax(actions=actions, gamma=gamma, r_max=r_max, v_max=v_max, deduce_v_max=False, n_known=n_known,
                       deduce_n_known=False, epsilon_q=epsilon_q, epsilon_m=epsilon_m, delta=delta,
                       n_states=n_states, max_memory_size=max_mem, prior=0.2, estimate_distances_online=True,
                       min_sampling_probability=p_min, name='LRMax(Dmax=0.2)')
    maxqinit = MaxQInit(actions=actions, gamma=gamma, r_max=r_max, v_max=v_max, deduce_v_max=False,
                        n_known=n_known, deduce_n_known=False, epsilon_q=epsilon_q, epsilon_m=epsilon_m,
                        delta=delta, n_states=n_states, min_sampling_probability=p_min, name='MaxQInit')
    lrmaxqinit = LRMaxQInit(actions=actions, gamma=gamma, r_max=r_max, v_max=v_max, deduce_v_max=False,
                            n_known=n_known, deduce_n_known=False, epsilon_q=epsilon_q, epsilon_m=epsilon_m,
                            delta=delta, n_states=n_states, max_memory_size=max_mem, prior=None,
                            estimate_distances_online=True, min_sampling_probability=p_min, name='LRMaxQInit')

    agents_pool = [rmax, lrmax, lrmaxprior, maxqinit]  # , lrmaxqinit]

    # Run
    run_agents_lifelong(agents_pool, env_distribution, n_instances=5, n_tasks=50, n_episodes=50, n_steps=100,
                        reset_at_terminal=False, plot_only=False, open_plot=True, plot_title=True)
def example():
    n_env = 4
    env_distribution = make_env_distribution(env_class='test', n_env=n_env, gamma=GAMMA, w=60, h=20)
    actions = env_distribution.get_actions()
    m = 1  # Count threshold
    max_mem = None
    p_min = 1. / float(n_env)
    delta = 0.99

    lrmax = LRMax(actions=actions, gamma=GAMMA, count_threshold=m, max_memory_size=max_mem, prior=None,
                  min_sampling_probability=p_min, delta=delta)
    rmax_max_q_init = MaxQInit(actions=actions, gamma=GAMMA, count_threshold=m, min_sampling_probability=p_min,
                               delta=delta)
    rmax = RMax(actions=actions, gamma=GAMMA, count_threshold=m)

    run_agents_lifelong([rmax_max_q_init, lrmax, rmax], env_distribution, samples=10, episodes=10, steps=100,
                        reset_at_terminal=False, open_plot=True, cumulative_plot=False,
                        is_tracked_value_discounted=True, plot_only=False)
def experiment():
    # Parameters
    gamma = .9
    n_env = 5
    n_states = 20
    env_distribution = make_env_distribution(env_class='corridor', n_env=n_env, gamma=gamma, w=n_states, h=1)
    actions = env_distribution.get_actions()
    n_known = 1
    p_min = 1. / float(n_env)
    r_max = 1.
    v_max = 10.
    epsilon_q = .01
    epsilon_m = .01
    delta = .1
    max_mem = 1

    # Agents
    rmax = RMax(actions=actions, gamma=gamma, r_max=r_max, v_max=v_max, deduce_v_max=False, n_known=n_known,
                deduce_n_known=False, epsilon_q=epsilon_q, epsilon_m=epsilon_m, name='RMax')
    lrmax = LRMax(actions=actions, gamma=gamma, r_max=r_max, v_max=v_max, deduce_v_max=False, n_known=n_known,
                  deduce_n_known=False, epsilon_q=epsilon_q, epsilon_m=epsilon_m, delta=delta, n_states=n_states,
                  max_memory_size=max_mem, prior=None, estimate_distances_online=True,
                  min_sampling_probability=p_min, name='LRMax')
    lrmaxprior02 = LRMax(actions=actions, gamma=gamma, r_max=r_max, v_max=v_max, deduce_v_max=False,
                         n_known=n_known, deduce_n_known=False, epsilon_q=epsilon_q, epsilon_m=epsilon_m,
                         delta=delta, n_states=n_states, max_memory_size=max_mem, prior=0.2,
                         estimate_distances_online=False, min_sampling_probability=p_min, name='LRMax(0.2)')
    maxqinit = MaxQInit(actions=actions, gamma=gamma, r_max=r_max, v_max=v_max, deduce_v_max=False,
                        n_known=n_known, deduce_n_known=False, epsilon_q=epsilon_q, epsilon_m=epsilon_m,
                        delta=delta, n_states=n_states, min_sampling_probability=p_min, name='MaxQInit')
    lrmaxqinit = LRMaxQInit(actions=actions, gamma=gamma, r_max=r_max, v_max=v_max, deduce_v_max=False,
                            n_known=n_known, deduce_n_known=False, epsilon_q=epsilon_q, epsilon_m=epsilon_m,
                            delta=delta, n_states=n_states, max_memory_size=max_mem, prior=None,
                            estimate_distances_online=True, min_sampling_probability=p_min, name='LRMaxQInit')
    lrmaxqinitprior02 = LRMaxQInit(actions=actions, gamma=gamma, r_max=r_max, v_max=v_max, deduce_v_max=False,
                                   n_known=n_known, deduce_n_known=False, epsilon_q=epsilon_q, epsilon_m=epsilon_m,
                                   delta=delta, n_states=n_states, max_memory_size=max_mem, prior=0.2,
                                   estimate_distances_online=True, min_sampling_probability=p_min,
                                   name='LRMaxQInit(0.2)')

    agents_pool = [
        rmax,
        lrmax,
        lrmaxprior02,
        maxqinit,
        lrmaxqinit,
        lrmaxqinitprior02
    ]

    # Run
    run_agents_lifelong(agents_pool, env_distribution, name_identifier=None, n_instances=1, n_tasks=20,
                        n_episodes=20, n_steps=11, reset_at_terminal=False, do_run=False, do_plot=True,
                        open_plot=False, episodes_moving_average=False, episodes_ma_width=10,
                        tasks_moving_average=False, tasks_ma_width=10, latex_rendering=True, plot_title=False)
def experiment(p, name):
    # Parameters
    gamma = .9
    n_env = 5
    size = p['size']
    env_distribution = make_env_distribution(env_class='tight', n_env=n_env, gamma=gamma, env_name=name,
                                             version=p['version'], w=size, h=size, stochastic=p['stochastic'],
                                             verbose=False)
    actions = env_distribution.get_actions()
    n_known = p['n_known']
    p_min = 1. / n_env
    epsilon_q = .01
    epsilon_m = .01
    delta = .1
    r_max = 1.
    v_max = 10.
    n_states = 4
    max_mem = 1

    # Agents
    rmax = RMax(actions=actions, gamma=gamma, r_max=r_max, v_max=v_max, deduce_v_max=False, n_known=n_known,
                deduce_n_known=False, epsilon_q=epsilon_q, epsilon_m=epsilon_m, name='RMax')
    lrmax = LRMax(actions=actions, gamma=gamma, r_max=r_max, v_max=v_max, deduce_v_max=False, n_known=n_known,
                  deduce_n_known=False, epsilon_q=epsilon_q, epsilon_m=epsilon_m, delta=delta, n_states=n_states,
                  max_memory_size=max_mem, prior=None, estimate_distances_online=True,
                  min_sampling_probability=p_min, name='LRMax')
    lrmax_p01 = LRMax(actions=actions, gamma=gamma, r_max=r_max, v_max=v_max, deduce_v_max=False, n_known=n_known,
                      deduce_n_known=False, epsilon_q=epsilon_q, epsilon_m=epsilon_m, delta=delta,
                      n_states=n_states, max_memory_size=max_mem, prior=0.1, estimate_distances_online=True,
                      min_sampling_probability=p_min, name='LRMax(0.1)')
    lrmax_p015 = LRMax(actions=actions, gamma=gamma, r_max=r_max, v_max=v_max, deduce_v_max=False, n_known=n_known,
                       deduce_n_known=False, epsilon_q=epsilon_q, epsilon_m=epsilon_m, delta=delta,
                       n_states=n_states, max_memory_size=max_mem, prior=0.15, estimate_distances_online=True,
                       min_sampling_probability=p_min, name='LRMax(0.15)')
    lrmax_p02 = LRMax(actions=actions, gamma=gamma, r_max=r_max, v_max=v_max, deduce_v_max=False, n_known=n_known,
                      deduce_n_known=False, epsilon_q=epsilon_q, epsilon_m=epsilon_m, delta=delta,
                      n_states=n_states, max_memory_size=max_mem, prior=0.2, estimate_distances_online=True,
                      min_sampling_probability=p_min, name='LRMax(0.2)')
    maxqinit = MaxQInit(actions=actions, gamma=gamma, r_max=r_max, v_max=v_max, deduce_v_max=False,
                        n_known=n_known, deduce_n_known=False, epsilon_q=epsilon_q, epsilon_m=epsilon_m,
                        delta=delta, n_states=n_states, min_sampling_probability=p_min, name='MaxQInit')
    lrmaxqinit = LRMaxQInit(actions=actions, gamma=gamma, r_max=r_max, v_max=v_max, deduce_v_max=False,
                            n_known=n_known, deduce_n_known=False, epsilon_q=epsilon_q, epsilon_m=epsilon_m,
                            delta=delta, n_states=n_states, max_memory_size=max_mem, prior=None,
                            estimate_distances_online=True, min_sampling_probability=p_min, name='LRMaxQInit')
    lrmaxqinit_p01 = LRMaxQInit(actions=actions, gamma=gamma, r_max=r_max, v_max=v_max, deduce_v_max=False,
                                n_known=n_known, deduce_n_known=False, epsilon_q=epsilon_q, epsilon_m=epsilon_m,
                                delta=delta, n_states=n_states, max_memory_size=max_mem, prior=0.1,
                                estimate_distances_online=True, min_sampling_probability=p_min,
                                name='LRMaxQInit(0.1)')
    lrmaxqinit_p015 = LRMaxQInit(actions=actions, gamma=gamma, r_max=r_max, v_max=v_max, deduce_v_max=False,
                                 n_known=n_known, deduce_n_known=False, epsilon_q=epsilon_q, epsilon_m=epsilon_m,
                                 delta=delta, n_states=n_states, max_memory_size=max_mem, prior=0.15,
                                 estimate_distances_online=True, min_sampling_probability=p_min,
                                 name='LRMaxQInit(0.15)')
    lrmaxqinit_p02 = LRMaxQInit(actions=actions, gamma=gamma, r_max=r_max, v_max=v_max, deduce_v_max=False,
                                n_known=n_known, deduce_n_known=False, epsilon_q=epsilon_q, epsilon_m=epsilon_m,
                                delta=delta, n_states=n_states, max_memory_size=max_mem, prior=0.2,
                                estimate_distances_online=True, min_sampling_probability=p_min,
                                name='LRMaxQInit(0.2)')

    # agents_pool = [rmax, lrmax, lrmax_p01, lrmax_p015, lrmax_p02, maxqinit, lrmaxqinit, lrmaxqinit_p01,
    #                lrmaxqinit_p015, lrmaxqinit_p02]
    agents_pool = [rmax, lrmax, lrmax_p02, lrmax_p01, maxqinit, lrmaxqinit, lrmaxqinit_p01]

    # Run
    run_agents_lifelong(agents_pool, env_distribution, n_instances=2, n_tasks=p['n_tasks'],
                        n_episodes=p['n_episodes'], n_steps=p['n_steps'], reset_at_terminal=False, open_plot=False,
                        plot_title=False, plot_legend=2, do_run=True, do_plot=True, parallel_run=True,
                        n_processes=None, episodes_moving_average=True, episodes_ma_width=100,
                        tasks_moving_average=False, latex_rendering=True)
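# Hedged usage sketch (illustration only, not part of the original script): the dictionary lists the keys read by
# experiment(p, name) above; the values and the environment name are hypothetical placeholders.
if __name__ == '__main__':
    example_params = {
        'size': 4,           # grid width and height of the 'tight' environments
        'version': 1,        # environment version forwarded to make_env_distribution (hypothetical value)
        'stochastic': True,  # whether transitions are stochastic
        'n_known': 10,       # known-ness count threshold
        'n_tasks': 20,
        'n_episodes': 1000,
        'n_steps': 10
    }
    experiment(example_params, name='tight-example')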
def experiment():
    # Parameters
    gamma = .9
    n_env = 5
    w, h = 20, 20
    n_states = w * h
    env_distribution = make_env_distribution(
        env_class='grid-world', env_name='grid-world-two-goals-large', n_env=n_env, gamma=gamma, w=w, h=h)
    actions = env_distribution.get_actions()
    n_known = 1
    p_min = 1. / float(n_env)
    r_max = 1.
    v_max = 10.
    epsilon_q = .01
    epsilon_m = .01
    delta = .1
    max_mem = 1

    # Agents
    rmax = RMax(actions=actions, gamma=gamma, r_max=r_max, v_max=v_max, deduce_v_max=False, n_known=n_known,
                deduce_n_known=False, epsilon_q=epsilon_q, epsilon_m=epsilon_m, name='RMax')
    lrmax = LRMax(actions=actions, gamma=gamma, r_max=r_max, v_max=v_max, deduce_v_max=False, n_known=n_known,
                  deduce_n_known=False, epsilon_q=epsilon_q, epsilon_m=epsilon_m, delta=delta, n_states=n_states,
                  max_memory_size=max_mem, prior=None, estimate_distances_online=True,
                  min_sampling_probability=p_min, name='LRMax')
    lrmaxprior02 = LRMax(actions=actions, gamma=gamma, r_max=r_max, v_max=v_max, deduce_v_max=False,
                         n_known=n_known, deduce_n_known=False, epsilon_q=epsilon_q, epsilon_m=epsilon_m,
                         delta=delta, n_states=n_states, max_memory_size=max_mem, prior=0.2,
                         estimate_distances_online=False, min_sampling_probability=p_min, name='LRMax(Dmax0.2)')
    maxqinit = MaxQInit(actions=actions, gamma=gamma, r_max=r_max, v_max=v_max, deduce_v_max=False,
                        n_known=n_known, deduce_n_known=False, epsilon_q=epsilon_q, epsilon_m=epsilon_m,
                        delta=delta, n_states=n_states, min_sampling_probability=p_min, name='MaxQInit')
    lrmaxqinit = LRMaxQInit(actions=actions, gamma=gamma, r_max=r_max, v_max=v_max, deduce_v_max=False,
                            n_known=n_known, deduce_n_known=False, epsilon_q=epsilon_q, epsilon_m=epsilon_m,
                            delta=delta, n_states=n_states, max_memory_size=max_mem, prior=None,
                            estimate_distances_online=True, min_sampling_probability=p_min, name='LRMaxQInit')
    lrmaxqinitprior02 = LRMaxQInit(actions=actions, gamma=gamma, r_max=r_max, v_max=v_max, deduce_v_max=False,
                                   n_known=n_known, deduce_n_known=False, epsilon_q=epsilon_q, epsilon_m=epsilon_m,
                                   delta=delta, n_states=n_states, max_memory_size=max_mem, prior=0.2,
                                   estimate_distances_online=True, min_sampling_probability=p_min,
                                   name='LRMaxQInit(Dmax0.2)')

    agents_pool = [
        rmax,
        lrmax,
        lrmaxprior02,
        maxqinit,
        lrmaxqinit,
        lrmaxqinitprior02
    ]

    # Run
    run_agents_lifelong(agents_pool, env_distribution, name_identifier=None, n_instances=1, n_tasks=100,
                        n_episodes=100, n_steps=13, reset_at_terminal=False, open_plot=False, plot_title=True,
                        do_run=True, do_plot=True, parallel_run=True, n_processes=None)
def act(self, s, r):
    """
    Increment the time-step counter, then delegate action selection to LRMax.
    :param s: current state
    :param r: reward received at the previous step
    :return: action selected by LRMax.act
    """
    self.time_step += 1
    return LRMax.act(self, s, r)
def reset(self):
    """
    Reset the time-step counters recorded for the experiment, then delegate to LRMax.reset.
    :return: None
    """
    self.time_step = 0
    self.time_step_counter = []
    LRMax.reset(self)