def update_upper_bound(self):
    """
    Update the total upper bound on the Q-value function.
    Called at initialization and when a new state-action pair is known.
    :return: None
    """
    self.update_lipschitz_upper_bounds()
    self.initialize_upper_bound()
    RMax.update_upper_bound(self)
def reset(self):
    """
    Reset the attributes to initial state (called between instances).
    :return: None
    """
    self.update_memory()
    RMax.reset(self)
    if len(self.U_memory) > self.n_required_tasks:
        self.update_max_q_init_upper_bound()
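# Illustrative sketch (not the repository's implementation) of the kind of bound
# applied by update_max_q_init_upper_bound() above: following the MaxQInit idea,
# once enough tasks have been recorded, the default optimistic value v_max can be
# replaced by the element-wise maximum of the Q-value upper bounds stored in
# U_memory. The helper name and data layout below are hypothetical.
def max_q_init_bound(u_memory, s, a, v_max):
    """Return max_i U_i(s, a) over the stored upper bounds, falling back to v_max."""
    values = [u[s][a] for u in u_memory if s in u and a in u[s]]
    return max(values) if values else v_max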
def __init__(self, actions, gamma=0.9, r_max=1., v_max=None, deduce_v_max=True, n_known=None,
             deduce_n_known=True, epsilon_q=0.1, epsilon_m=None, delta=None, n_states=None,
             min_sampling_probability=0.1, name="MaxQInit"):
    """
    :param actions: action space of the environment
    :param gamma: (float) discount factor
    :param r_max: (float) known upper-bound on the reward function
    :param v_max: (float) known upper-bound on the value function
    :param deduce_v_max: (bool) set to True to deduce v_max from r_max
    :param n_known: (int) count after which a state-action pair is considered known
        (only set n_known if delta and epsilon_m are not defined)
    :param deduce_n_known: (bool) set to True to deduce n_known from (delta, n_states, epsilon_m)
    :param epsilon_q: (float) precision of value iteration algorithm for Q-value computation
    :param epsilon_m: (float) precision of the learned models in L1 norm
    :param delta: (float) models are learned epsilon_m-closely with probability at least 1 - delta
    :param n_states: (int) number of states
    :param min_sampling_probability: (float) minimum sampling probability of an environment
    :param name: (str)
    """
    RMax.__init__(self, actions=actions, gamma=gamma, r_max=r_max, v_max=v_max,
                  deduce_v_max=deduce_v_max, n_known=n_known, deduce_n_known=deduce_n_known,
                  epsilon_q=epsilon_q, epsilon_m=epsilon_m, delta=delta, n_states=n_states,
                  name=name)

    self.min_sampling_probability = min_sampling_probability
    self.SA_memory = defaultdict(lambda: defaultdict(lambda: False))
    self.U_memory = []  # Upper-bounds on the Q-values of previous MDPs
    self.n_required_tasks = number_of_tasks_for_high_confidence_upper_bound(
        delta, min_sampling_probability)
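# Sketch of one way n_required_tasks can be derived; the exact formula used by
# number_of_tasks_for_high_confidence_upper_bound may differ. If every MDP of the
# distribution is sampled with probability at least min_sampling_probability, the
# probability that a given MDP is never seen among m i.i.d. tasks is at most
# (1 - min_sampling_probability) ** m. Requiring this to be at most delta yields
# m >= log(delta) / log(1 - min_sampling_probability).
import math

def n_tasks_for_coverage(delta, min_sampling_probability):
    """Hypothetical helper: smallest m such that (1 - p_min)^m <= delta."""
    return math.ceil(math.log(delta) / math.log(1. - min_sampling_probability))

# Example: delta = 0.1 and min_sampling_probability = 0.2 give
# ceil(log(0.1) / log(0.8)) = 11 tasks.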
def reset(self):
    """
    Reset the attributes to initial state (called between instances).
    Save the previous model.
    :return: None
    """
    RMax.reset(self)
    self.write(init=False)

    # Reset recorded variables between MDPs
    self.discounted_return = 0.
    self.total_return = 0.
    self.n_time_steps = 0
    self.update_time_steps = []
def reset(self):
    """
    Reset the attributes to initial state (called between instances).
    Save the previous model.
    :return: None
    """
    # Save previously learned model
    if len(self.counter) > 0 and (self.max_memory_size is None or len(self.U_lip) < self.max_memory_size):
        self.update_memory()

    RMax.reset(self)

    if self.estimate_distances_online:
        self.update_max_distances()

    self.update_upper_bound()
def experiment(p):
    # Parameters
    gamma = .9
    n_env = 5
    size = p['size']
    env_distribution = make_env_distribution(env_class='tight', n_env=n_env, gamma=gamma,
                                             env_name=p['name'], w=size, h=size,
                                             stochastic=p['stochastic'])
    actions = env_distribution.get_actions()
    n_known = p['n_known']
    p_min = 1. / float(n_env)
    epsilon_q = .01
    epsilon_m = .01
    delta = .1
    r_max = 1.
    v_max = p['v_max']
    n_states = 4
    max_mem = 1

    # Agents
    rmax = RMax(actions=actions, gamma=gamma, r_max=r_max, v_max=v_max, deduce_v_max=False,
                n_known=n_known, deduce_n_known=False, epsilon_q=epsilon_q, epsilon_m=epsilon_m,
                name='RMax')
    lrmax = LRMax(actions=actions, gamma=gamma, r_max=r_max, v_max=v_max, deduce_v_max=False,
                  n_known=n_known, deduce_n_known=False, epsilon_q=epsilon_q, epsilon_m=epsilon_m,
                  delta=delta, n_states=n_states, max_memory_size=max_mem, prior=None,
                  estimate_distances_online=True, min_sampling_probability=p_min, name='LRMax')
    lrmax_p01 = LRMax(actions=actions, gamma=gamma, r_max=r_max, v_max=v_max, deduce_v_max=False,
                      n_known=n_known, deduce_n_known=False, epsilon_q=epsilon_q,
                      epsilon_m=epsilon_m, delta=delta, n_states=n_states,
                      max_memory_size=max_mem, prior=0.1, estimate_distances_online=True,
                      min_sampling_probability=p_min, name='LRMax(Dmax=0.1)')
    lrmax_p02 = LRMax(actions=actions, gamma=gamma, r_max=r_max, v_max=v_max, deduce_v_max=False,
                      n_known=n_known, deduce_n_known=False, epsilon_q=epsilon_q,
                      epsilon_m=epsilon_m, delta=delta, n_states=n_states,
                      max_memory_size=max_mem, prior=0.2, estimate_distances_online=True,
                      min_sampling_probability=p_min, name='LRMax(Dmax=0.2)')
    maxqinit = MaxQInit(actions=actions, gamma=gamma, r_max=r_max, v_max=v_max, deduce_v_max=False,
                        n_known=n_known, deduce_n_known=False, epsilon_q=epsilon_q,
                        epsilon_m=epsilon_m, delta=delta, n_states=n_states,
                        min_sampling_probability=p_min, name='MaxQInit')
    lrmaxqinit = LRMaxQInit(actions=actions, gamma=gamma, r_max=r_max, v_max=v_max,
                            deduce_v_max=False, n_known=n_known, deduce_n_known=False,
                            epsilon_q=epsilon_q, epsilon_m=epsilon_m, delta=delta,
                            n_states=n_states, max_memory_size=max_mem, prior=None,
                            estimate_distances_online=True, min_sampling_probability=p_min,
                            name='LRMaxQInit')
    lrmaxqinit_p01 = LRMaxQInit(actions=actions, gamma=gamma, r_max=r_max, v_max=v_max,
                                deduce_v_max=False, n_known=n_known, deduce_n_known=False,
                                epsilon_q=epsilon_q, epsilon_m=epsilon_m, delta=delta,
                                n_states=n_states, max_memory_size=max_mem, prior=0.1,
                                estimate_distances_online=True, min_sampling_probability=p_min,
                                name='LRMaxQInit(Dmax=0.1)')
    lrmaxqinit_p02 = LRMaxQInit(actions=actions, gamma=gamma, r_max=r_max, v_max=v_max,
                                deduce_v_max=False, n_known=n_known, deduce_n_known=False,
                                epsilon_q=epsilon_q, epsilon_m=epsilon_m, delta=delta,
                                n_states=n_states, max_memory_size=max_mem, prior=0.2,
                                estimate_distances_online=True, min_sampling_probability=p_min,
                                name='LRMaxQInit(Dmax=0.2)')

    agents_pool = [rmax, lrmax, lrmax_p01, lrmax_p02, maxqinit, lrmaxqinit, lrmaxqinit_p01,
                   lrmaxqinit_p02]

    # Run
    run_agents_lifelong(agents_pool, env_distribution, n_instances=3, n_tasks=p['n_tasks'],
                        n_episodes=p['n_episodes'], n_steps=p['n_steps'], reset_at_terminal=False,
                        open_plot=False, plot_title=True, do_run=True, do_plot=True,
                        parallel_run=True, n_processes=None)
def experiment():
    n_env = 5
    env_distribution = make_env_distribution(env_class='maze-mono-goal', env_name='maze-mono-goal',
                                             n_env=n_env, gamma=GAMMA)
    actions = env_distribution.get_actions()
    p_min = 1. / float(n_env)
    delta = .1
    m = 100
    max_mem = 10

    rmax = RMax(actions=actions, gamma=GAMMA, count_threshold=m)
    rmax_q = MaxQInit(actions=actions, gamma=GAMMA, count_threshold=m,
                      min_sampling_probability=p_min, delta=delta)
    lrmax1 = LRMax(actions=actions, gamma=GAMMA, count_threshold=m, max_memory_size=max_mem,
                   prior=1.)
    lrmax05 = LRMax(actions=actions, gamma=GAMMA, count_threshold=m, max_memory_size=max_mem,
                    prior=0.5)
    lrmax02 = LRMax(actions=actions, gamma=GAMMA, count_threshold=m, max_memory_size=max_mem,
                    prior=0.2)
    lrmax_learn = LRMax(actions=actions, gamma=GAMMA, count_threshold=m, max_memory_size=max_mem,
                        prior=None, min_sampling_probability=p_min, delta=delta)

    agents_pool = [rmax, lrmax1, lrmax05, lrmax02, lrmax_learn, rmax_q]

    run_agents_lifelong(agents_pool, env_distribution, samples=20, episodes=100, steps=1000,
                        reset_at_terminal=False, open_plot=True, cumulative_plot=False,
                        is_tracked_value_discounted=True, plot_only=False, plot_title=False)
def experiment():
    n_env = 5
    env_distribution = make_env_distribution(env_class='corridor', n_env=n_env, gamma=GAMMA,
                                             w=20, h=1)
    actions = env_distribution.get_actions()
    p_min = 1. / float(n_env)
    delta = .1
    m = 1
    max_mem = 2

    rmax = RMax(actions=actions, gamma=GAMMA, count_threshold=m)
    rmax_q = MaxQInit(actions=actions, gamma=GAMMA, count_threshold=m,
                      min_sampling_probability=p_min, delta=delta)
    lrmax0_2 = LRMaxCT(actions=actions, gamma=GAMMA, count_threshold=m, max_memory_size=max_mem,
                       prior=0.2)
    lrmax0_6 = LRMaxCT(actions=actions, gamma=GAMMA, count_threshold=m, max_memory_size=max_mem,
                       prior=0.6)
    lrmax1_0 = LRMaxCT(actions=actions, gamma=GAMMA, count_threshold=m, max_memory_size=max_mem,
                       prior=1.0)
    lrmax_learn = LRMaxCT(actions=actions, gamma=GAMMA, count_threshold=m, max_memory_size=max_mem,
                          prior=None, min_sampling_probability=p_min, delta=delta)

    agents_pool = [rmax, lrmax1_0, lrmax0_6, lrmax0_2, lrmax_learn, rmax_q]

    run_agents_lifelong(agents_pool, env_distribution, samples=20, episodes=20, steps=10,
                        reset_at_terminal=False, open_plot=True, cumulative_plot=False,
                        is_tracked_value_discounted=False, plot_only=False, plot_title=False)
def __init__(self, actions, gamma=.9, r_max=1., v_max=None, deduce_v_max=True, n_known=None,
             deduce_n_known=True, epsilon_q=0.1, epsilon_m=None, delta=None, n_states=None,
             name="ExpRMax", path='results/'):
    RMax.__init__(self, actions=actions, gamma=gamma, r_max=r_max, v_max=v_max,
                  deduce_v_max=deduce_v_max, n_known=n_known, deduce_n_known=deduce_n_known,
                  epsilon_q=epsilon_q, epsilon_m=epsilon_m, delta=delta, n_states=n_states,
                  name=name)

    # Recorded variables
    self.discounted_return = 0.
    self.total_return = 0.
    self.n_time_steps = 0  # nb of time steps
    self.update_time_steps = []  # time steps where a model update occurred

    self.path = path
    self.instance_number = 0
    self.run_number = 0
def experiment():
    # Parameters
    gamma = .9
    env_distribution = make_env_distribution(env_class='deterministic-super-tight',
                                             env_name='deterministic-super-tight-bignknown',
                                             gamma=gamma)
    actions = env_distribution.get_actions()
    n_known = 100
    p_min = 1. / 3.
    epsilon_q = .01
    epsilon_m = .01
    delta = .1
    r_max = 1.
    v_max = 1.
    n_states = 4
    max_mem = 9

    # Agents
    rmax = RMax(actions=actions, gamma=gamma, r_max=r_max, v_max=v_max, deduce_v_max=False,
                n_known=n_known, deduce_n_known=False, epsilon_q=epsilon_q, epsilon_m=epsilon_m,
                name='RMax')
    lrmax = LRMax(actions=actions, gamma=gamma, r_max=r_max, v_max=v_max, deduce_v_max=False,
                  n_known=n_known, deduce_n_known=False, epsilon_q=epsilon_q, epsilon_m=epsilon_m,
                  delta=delta, n_states=n_states, max_memory_size=max_mem, prior=None,
                  estimate_distances_online=True, min_sampling_probability=p_min, name='LRMax')
    lrmaxprior = LRMax(actions=actions, gamma=gamma, r_max=r_max, v_max=v_max, deduce_v_max=False,
                       n_known=n_known, deduce_n_known=False, epsilon_q=epsilon_q,
                       epsilon_m=epsilon_m, delta=delta, n_states=n_states,
                       max_memory_size=max_mem, prior=0.1, estimate_distances_online=True,
                       min_sampling_probability=p_min, name='LRMax(Dmax=0.1)')
    maxqinit = MaxQInit(actions=actions, gamma=gamma, r_max=r_max, v_max=v_max, deduce_v_max=False,
                        n_known=n_known, deduce_n_known=False, epsilon_q=epsilon_q,
                        epsilon_m=epsilon_m, delta=delta, n_states=n_states,
                        min_sampling_probability=p_min, name='MaxQInit')
    lrmaxqinit = LRMaxQInit(actions=actions, gamma=gamma, r_max=r_max, v_max=v_max,
                            deduce_v_max=False, n_known=n_known, deduce_n_known=False,
                            epsilon_q=epsilon_q, epsilon_m=epsilon_m, delta=delta,
                            n_states=n_states, max_memory_size=max_mem, prior=None,
                            estimate_distances_online=True, min_sampling_probability=p_min,
                            name='LRMaxQInit')
    lrmaxqinitprior = LRMaxQInit(actions=actions, gamma=gamma, r_max=r_max, v_max=v_max,
                                 deduce_v_max=False, n_known=n_known, deduce_n_known=False,
                                 epsilon_q=epsilon_q, epsilon_m=epsilon_m, delta=delta,
                                 n_states=n_states, max_memory_size=max_mem, prior=0.1,
                                 estimate_distances_online=True, min_sampling_probability=p_min,
                                 name='LRMaxQInit(Dmax=0.1)')

    agents_pool = [rmax, lrmax, lrmaxprior, maxqinit, lrmaxqinit, lrmaxqinitprior]

    # Run
    run_agents_lifelong(agents_pool, env_distribution, n_instances=1, n_tasks=100, n_episodes=200,
                        n_steps=100, reset_at_terminal=False, open_plot=False, plot_title=True,
                        do_run=False, do_plot=True, parallel_run=True, n_processes=None)
def main():
    # Setup MDP.
    w = 6
    h = 6
    mdp = GridWorld(width=w, height=h, init_loc=(1, 1), goal_locs=[(6, 6)], slip_prob=.1)

    # Setup Agents.
    rand_agent = RandomAgent(actions=mdp.get_actions())
    ql_agent = QLearningAgent(actions=mdp.get_actions())

    # Compute number of samples for R-MAX to achieve epsilon-optimal behavior
    # with high probability (1 - delta)
    compute_n_samples = False
    if compute_n_samples:
        epsilon = .1
        delta = .05
        m_r = np.log(2. / delta) / (2. * epsilon ** 2)
        m_t = 2. * (np.log(2 ** (float(w * h)) - 2.) - np.log(delta)) / (epsilon ** 2)
        n_samples = int(max(m_r, m_t))
    else:
        n_samples = 30

    simple_rl_rmax_agent = RMaxAgent(actions=mdp.get_actions(), gamma=.9, horizon=3,
                                     s_a_threshold=n_samples, name='SimpleRL-R-MAX')
    rmax_agent = RMax(actions=mdp.get_actions(), gamma=.9, count_threshold=n_samples)

    # Run experiment and make plot.
    run_agents_on_mdp([rand_agent, ql_agent, rmax_agent, simple_rl_rmax_agent], mdp, instances=5,
                      episodes=100, steps=20, reset_at_terminal=True, verbose=False)
def experiment():
    # Parameters
    gamma = .9
    env_distribution = make_env_distribution(env_class='stochastic-tight',
                                             env_name='stochastic-tight', gamma=gamma)
    actions = env_distribution.get_actions()
    n_known = 10
    p_min = 1. / 7.  # There are seven possible MDPs
    epsilon_q = .1
    epsilon_m = .01
    delta = .1
    r_max = 1.
    v_max = 1.
    n_states = 4
    max_mem = 10

    # Agents
    rmax = RMax(actions=actions, gamma=gamma, r_max=r_max, v_max=v_max, deduce_v_max=False,
                n_known=n_known, deduce_n_known=False, epsilon_q=epsilon_q, epsilon_m=epsilon_m,
                name='RMax')
    lrmax = LRMax(actions=actions, gamma=gamma, r_max=r_max, v_max=v_max, deduce_v_max=False,
                  n_known=n_known, deduce_n_known=False, epsilon_q=epsilon_q, epsilon_m=epsilon_m,
                  delta=delta, n_states=n_states, max_memory_size=max_mem, prior=None,
                  estimate_distances_online=True, min_sampling_probability=p_min, name='LRMax')
    lrmaxprior = LRMax(actions=actions, gamma=gamma, r_max=r_max, v_max=v_max, deduce_v_max=False,
                       n_known=n_known, deduce_n_known=False, epsilon_q=epsilon_q,
                       epsilon_m=epsilon_m, delta=delta, n_states=n_states,
                       max_memory_size=max_mem, prior=0.2, estimate_distances_online=True,
                       min_sampling_probability=p_min, name='LRMax(Dmax=0.2)')
    maxqinit = MaxQInit(actions=actions, gamma=gamma, r_max=r_max, v_max=v_max, deduce_v_max=False,
                        n_known=n_known, deduce_n_known=False, epsilon_q=epsilon_q,
                        epsilon_m=epsilon_m, delta=delta, n_states=n_states,
                        min_sampling_probability=p_min, name='MaxQInit')
    lrmaxqinit = LRMaxQInit(actions=actions, gamma=gamma, r_max=r_max, v_max=v_max,
                            deduce_v_max=False, n_known=n_known, deduce_n_known=False,
                            epsilon_q=epsilon_q, epsilon_m=epsilon_m, delta=delta,
                            n_states=n_states, max_memory_size=max_mem, prior=None,
                            estimate_distances_online=True, min_sampling_probability=p_min,
                            name='LRMaxQInit')

    agents_pool = [rmax, lrmax, lrmaxprior, maxqinit]  # , lrmaxqinit]

    # Run
    run_agents_lifelong(agents_pool, env_distribution, n_instances=5, n_tasks=50, n_episodes=50,
                        n_steps=100, reset_at_terminal=False, plot_only=False, open_plot=True,
                        plot_title=True)
def example():
    n_env = 4
    env_distribution = make_env_distribution(env_class='test', n_env=n_env, gamma=GAMMA, w=60, h=20)
    actions = env_distribution.get_actions()
    m = 1  # Count threshold
    max_mem = None
    p_min = 1. / float(n_env)
    delta = 0.99

    lrmax = LRMax(actions=actions, gamma=GAMMA, count_threshold=m, max_memory_size=max_mem,
                  prior=None, min_sampling_probability=p_min, delta=delta)
    rmax_max_q_init = MaxQInit(actions=actions, gamma=GAMMA, count_threshold=m,
                               min_sampling_probability=p_min, delta=delta)
    rmax = RMax(actions=actions, gamma=GAMMA, count_threshold=m)

    run_agents_lifelong([rmax_max_q_init, lrmax, rmax], env_distribution, samples=10, episodes=10,
                        steps=100, reset_at_terminal=False, open_plot=True, cumulative_plot=False,
                        is_tracked_value_discounted=True, plot_only=False)
def experiment():
    # Parameters
    gamma = .9
    n_env = 5
    n_states = 20
    env_distribution = make_env_distribution(env_class='corridor', n_env=n_env, gamma=gamma,
                                             w=n_states, h=1)
    actions = env_distribution.get_actions()
    n_known = 1
    p_min = 1. / float(n_env)
    r_max = 1.
    v_max = 10.
    epsilon_q = .01
    epsilon_m = .01
    delta = .1
    max_mem = 1

    # Agents
    rmax = RMax(actions=actions, gamma=gamma, r_max=r_max, v_max=v_max, deduce_v_max=False,
                n_known=n_known, deduce_n_known=False, epsilon_q=epsilon_q, epsilon_m=epsilon_m,
                name='RMax')
    lrmax = LRMax(actions=actions, gamma=gamma, r_max=r_max, v_max=v_max, deduce_v_max=False,
                  n_known=n_known, deduce_n_known=False, epsilon_q=epsilon_q, epsilon_m=epsilon_m,
                  delta=delta, n_states=n_states, max_memory_size=max_mem, prior=None,
                  estimate_distances_online=True, min_sampling_probability=p_min, name='LRMax')
    lrmaxprior02 = LRMax(actions=actions, gamma=gamma, r_max=r_max, v_max=v_max,
                         deduce_v_max=False, n_known=n_known, deduce_n_known=False,
                         epsilon_q=epsilon_q, epsilon_m=epsilon_m, delta=delta, n_states=n_states,
                         max_memory_size=max_mem, prior=0.2, estimate_distances_online=False,
                         min_sampling_probability=p_min, name='LRMax(0.2)')
    maxqinit = MaxQInit(actions=actions, gamma=gamma, r_max=r_max, v_max=v_max, deduce_v_max=False,
                        n_known=n_known, deduce_n_known=False, epsilon_q=epsilon_q,
                        epsilon_m=epsilon_m, delta=delta, n_states=n_states,
                        min_sampling_probability=p_min, name='MaxQInit')
    lrmaxqinit = LRMaxQInit(actions=actions, gamma=gamma, r_max=r_max, v_max=v_max,
                            deduce_v_max=False, n_known=n_known, deduce_n_known=False,
                            epsilon_q=epsilon_q, epsilon_m=epsilon_m, delta=delta,
                            n_states=n_states, max_memory_size=max_mem, prior=None,
                            estimate_distances_online=True, min_sampling_probability=p_min,
                            name='LRMaxQInit')
    lrmaxqinitprior02 = LRMaxQInit(actions=actions, gamma=gamma, r_max=r_max, v_max=v_max,
                                   deduce_v_max=False, n_known=n_known, deduce_n_known=False,
                                   epsilon_q=epsilon_q, epsilon_m=epsilon_m, delta=delta,
                                   n_states=n_states, max_memory_size=max_mem, prior=0.2,
                                   estimate_distances_online=True,
                                   min_sampling_probability=p_min, name='LRMaxQInit(0.2)')

    agents_pool = [rmax, lrmax, lrmaxprior02, maxqinit, lrmaxqinit, lrmaxqinitprior02]

    # Run
    run_agents_lifelong(agents_pool, env_distribution, name_identifier=None, n_instances=1,
                        n_tasks=20, n_episodes=20, n_steps=11, reset_at_terminal=False,
                        do_run=False, do_plot=True, open_plot=False,
                        episodes_moving_average=False, episodes_ma_width=10,
                        tasks_moving_average=False, tasks_ma_width=10, latex_rendering=True,
                        plot_title=False)
def __init__(self, actions, gamma=.9, r_max=1., v_max=None, deduce_v_max=True, n_known=None,
             deduce_n_known=True, epsilon_q=0.1, epsilon_m=None, delta=None, n_states=None,
             max_memory_size=None, prior=None, estimate_distances_online=True,
             min_sampling_probability=.1, name="LRMax"):
    """
    :param actions: action space of the environment
    :param gamma: (float) discount factor
    :param r_max: (float) known upper-bound on the reward function
    :param v_max: (float) known upper-bound on the value function
    :param deduce_v_max: (bool) set to True to deduce v_max from r_max
    :param n_known: (int) count after which a state-action pair is considered known
        (only set n_known if delta and epsilon_m are not defined)
    :param deduce_n_known: (bool) set to True to deduce n_known from (delta, n_states, epsilon_m)
    :param epsilon_q: (float) precision of value iteration algorithm for Q-value computation
    :param epsilon_m: (float) precision of the learned models in L1 norm
    :param delta: (float) models are learned epsilon_m-closely with probability at least 1 - delta
    :param n_states: (int) number of states
    :param max_memory_size: (int) maximum number of saved models (infinity if None)
    :param prior: (float) prior knowledge of maximum model's distance
    :param estimate_distances_online: (bool) set to True for online estimation of a tighter
        upper-bound on the model pseudo-distances. The estimation is valid with high probability.
    :param min_sampling_probability: (float) minimum sampling probability of an environment
    :param name: (str)
    """
    self.name = name
    RMax.__init__(self, actions=actions, gamma=gamma, r_max=r_max, v_max=v_max,
                  deduce_v_max=deduce_v_max, n_known=n_known, deduce_n_known=deduce_n_known,
                  epsilon_q=epsilon_q, epsilon_m=epsilon_m, delta=delta, n_states=n_states,
                  name=name)

    # Lifelong Learning memories
    self.max_memory_size = max_memory_size
    self.U_memory = []
    self.R_memory = []
    self.T_memory = []
    self.SA_memory = defaultdict(lambda: defaultdict(lambda: False))
    self.U_lip = []
    self.b = self.epsilon_m * (1. + self.gamma * self.v_max)

    # Prior knowledge on maximum model distance
    prior_max = self.r_max + self.gamma * 2. * self.v_max
    self.prior = prior_max if prior is None else min(prior, prior_max)
    self.prior = round(self.prior, 2)

    # Online distances estimation
    self.estimate_distances_online = estimate_distances_online
    self.min_sampling_probability = min_sampling_probability
    self.D = defaultdict(lambda: defaultdict(lambda: prior_max))  # Dictionary of distances (high probability)
    self.n_samples_high_confidence = compute_n_samples_high_confidence(min_sampling_probability, delta)

    self.update_upper_bound()
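# Usage sketch for the constructor above; the values below are illustrative only.
# Note that any prior larger than r_max + gamma * 2 * v_max is clamped to that
# value, so prior=None amounts to using the loosest valid distance bound.
agent = LRMax(actions=['up', 'down', 'left', 'right'],  # hypothetical action space
              gamma=0.9, r_max=1., v_max=10., deduce_v_max=False,
              n_known=10, deduce_n_known=False,
              epsilon_q=0.01, epsilon_m=0.01, delta=0.1, n_states=100,
              max_memory_size=5, prior=0.2,
              estimate_distances_online=True, min_sampling_probability=0.2,
              name='LRMax(Dmax=0.2)')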
def experiment():
    # Parameters
    gamma = .9
    n_env = 5
    w, h = 20, 20
    n_states = w * h
    env_distribution = make_env_distribution(env_class='grid-world',
                                             env_name='grid-world-two-goals-large', n_env=n_env,
                                             gamma=gamma, w=w, h=h)
    actions = env_distribution.get_actions()
    n_known = 1
    p_min = 1. / float(n_env)
    r_max = 1.
    v_max = 10.
    epsilon_q = .01
    epsilon_m = .01
    delta = .1
    max_mem = 1

    # Agents
    rmax = RMax(actions=actions, gamma=gamma, r_max=r_max, v_max=v_max, deduce_v_max=False,
                n_known=n_known, deduce_n_known=False, epsilon_q=epsilon_q, epsilon_m=epsilon_m,
                name='RMax')
    lrmax = LRMax(actions=actions, gamma=gamma, r_max=r_max, v_max=v_max, deduce_v_max=False,
                  n_known=n_known, deduce_n_known=False, epsilon_q=epsilon_q, epsilon_m=epsilon_m,
                  delta=delta, n_states=n_states, max_memory_size=max_mem, prior=None,
                  estimate_distances_online=True, min_sampling_probability=p_min, name='LRMax')
    lrmaxprior02 = LRMax(actions=actions, gamma=gamma, r_max=r_max, v_max=v_max,
                         deduce_v_max=False, n_known=n_known, deduce_n_known=False,
                         epsilon_q=epsilon_q, epsilon_m=epsilon_m, delta=delta, n_states=n_states,
                         max_memory_size=max_mem, prior=0.2, estimate_distances_online=False,
                         min_sampling_probability=p_min, name='LRMax(Dmax0.2)')
    maxqinit = MaxQInit(actions=actions, gamma=gamma, r_max=r_max, v_max=v_max, deduce_v_max=False,
                        n_known=n_known, deduce_n_known=False, epsilon_q=epsilon_q,
                        epsilon_m=epsilon_m, delta=delta, n_states=n_states,
                        min_sampling_probability=p_min, name='MaxQInit')
    lrmaxqinit = LRMaxQInit(actions=actions, gamma=gamma, r_max=r_max, v_max=v_max,
                            deduce_v_max=False, n_known=n_known, deduce_n_known=False,
                            epsilon_q=epsilon_q, epsilon_m=epsilon_m, delta=delta,
                            n_states=n_states, max_memory_size=max_mem, prior=None,
                            estimate_distances_online=True, min_sampling_probability=p_min,
                            name='LRMaxQInit')
    lrmaxqinitprior02 = LRMaxQInit(actions=actions, gamma=gamma, r_max=r_max, v_max=v_max,
                                   deduce_v_max=False, n_known=n_known, deduce_n_known=False,
                                   epsilon_q=epsilon_q, epsilon_m=epsilon_m, delta=delta,
                                   n_states=n_states, max_memory_size=max_mem, prior=0.2,
                                   estimate_distances_online=True,
                                   min_sampling_probability=p_min, name='LRMaxQInit(Dmax0.2)')

    agents_pool = [rmax, lrmax, lrmaxprior02, maxqinit, lrmaxqinit, lrmaxqinitprior02]

    # Run
    run_agents_lifelong(agents_pool, env_distribution, name_identifier=None, n_instances=1,
                        n_tasks=100, n_episodes=100, n_steps=13, reset_at_terminal=False,
                        open_plot=False, plot_title=True, do_run=True, do_plot=True,
                        parallel_run=True, n_processes=None)
def experiment(p, name):
    # Parameters
    gamma = .9
    n_env = 5
    size = p['size']
    env_distribution = make_env_distribution(env_class='tight', n_env=n_env, gamma=gamma,
                                             env_name=name, version=p['version'], w=size, h=size,
                                             stochastic=p['stochastic'], verbose=False)
    actions = env_distribution.get_actions()
    n_known = p['n_known']
    p_min = 1. / n_env
    epsilon_q = .01
    epsilon_m = .01
    delta = .1
    r_max = 1.
    v_max = 10.
    n_states = 4
    max_mem = 1

    # Agents
    rmax = RMax(actions=actions, gamma=gamma, r_max=r_max, v_max=v_max, deduce_v_max=False,
                n_known=n_known, deduce_n_known=False, epsilon_q=epsilon_q, epsilon_m=epsilon_m,
                name='RMax')
    lrmax = LRMax(actions=actions, gamma=gamma, r_max=r_max, v_max=v_max, deduce_v_max=False,
                  n_known=n_known, deduce_n_known=False, epsilon_q=epsilon_q, epsilon_m=epsilon_m,
                  delta=delta, n_states=n_states, max_memory_size=max_mem, prior=None,
                  estimate_distances_online=True, min_sampling_probability=p_min, name='LRMax')
    lrmax_p01 = LRMax(actions=actions, gamma=gamma, r_max=r_max, v_max=v_max, deduce_v_max=False,
                      n_known=n_known, deduce_n_known=False, epsilon_q=epsilon_q,
                      epsilon_m=epsilon_m, delta=delta, n_states=n_states,
                      max_memory_size=max_mem, prior=0.1, estimate_distances_online=True,
                      min_sampling_probability=p_min, name='LRMax(0.1)')
    lrmax_p015 = LRMax(actions=actions, gamma=gamma, r_max=r_max, v_max=v_max, deduce_v_max=False,
                       n_known=n_known, deduce_n_known=False, epsilon_q=epsilon_q,
                       epsilon_m=epsilon_m, delta=delta, n_states=n_states,
                       max_memory_size=max_mem, prior=0.15, estimate_distances_online=True,
                       min_sampling_probability=p_min, name='LRMax(0.15)')
    lrmax_p02 = LRMax(actions=actions, gamma=gamma, r_max=r_max, v_max=v_max, deduce_v_max=False,
                      n_known=n_known, deduce_n_known=False, epsilon_q=epsilon_q,
                      epsilon_m=epsilon_m, delta=delta, n_states=n_states,
                      max_memory_size=max_mem, prior=0.2, estimate_distances_online=True,
                      min_sampling_probability=p_min, name='LRMax(0.2)')
    maxqinit = MaxQInit(actions=actions, gamma=gamma, r_max=r_max, v_max=v_max, deduce_v_max=False,
                        n_known=n_known, deduce_n_known=False, epsilon_q=epsilon_q,
                        epsilon_m=epsilon_m, delta=delta, n_states=n_states,
                        min_sampling_probability=p_min, name='MaxQInit')
    lrmaxqinit = LRMaxQInit(actions=actions, gamma=gamma, r_max=r_max, v_max=v_max,
                            deduce_v_max=False, n_known=n_known, deduce_n_known=False,
                            epsilon_q=epsilon_q, epsilon_m=epsilon_m, delta=delta,
                            n_states=n_states, max_memory_size=max_mem, prior=None,
                            estimate_distances_online=True, min_sampling_probability=p_min,
                            name='LRMaxQInit')
    lrmaxqinit_p01 = LRMaxQInit(actions=actions, gamma=gamma, r_max=r_max, v_max=v_max,
                                deduce_v_max=False, n_known=n_known, deduce_n_known=False,
                                epsilon_q=epsilon_q, epsilon_m=epsilon_m, delta=delta,
                                n_states=n_states, max_memory_size=max_mem, prior=0.1,
                                estimate_distances_online=True, min_sampling_probability=p_min,
                                name='LRMaxQInit(0.1)')
    lrmaxqinit_p015 = LRMaxQInit(actions=actions, gamma=gamma, r_max=r_max, v_max=v_max,
                                 deduce_v_max=False, n_known=n_known, deduce_n_known=False,
                                 epsilon_q=epsilon_q, epsilon_m=epsilon_m, delta=delta,
                                 n_states=n_states, max_memory_size=max_mem, prior=0.15,
                                 estimate_distances_online=True, min_sampling_probability=p_min,
                                 name='LRMaxQInit(0.15)')
    lrmaxqinit_p02 = LRMaxQInit(actions=actions, gamma=gamma, r_max=r_max, v_max=v_max,
                                deduce_v_max=False, n_known=n_known, deduce_n_known=False,
                                epsilon_q=epsilon_q, epsilon_m=epsilon_m, delta=delta,
                                n_states=n_states, max_memory_size=max_mem, prior=0.2,
                                estimate_distances_online=True, min_sampling_probability=p_min,
                                name='LRMaxQInit(0.2)')

    # agents_pool = [rmax, lrmax, lrmax_p01, lrmax_p015, lrmax_p02, maxqinit, lrmaxqinit,
    #                lrmaxqinit_p01, lrmaxqinit_p015, lrmaxqinit_p02]
    agents_pool = [rmax, lrmax, lrmax_p02, lrmax_p01, maxqinit, lrmaxqinit, lrmaxqinit_p01]

    # Run
    run_agents_lifelong(agents_pool, env_distribution, n_instances=2, n_tasks=p['n_tasks'],
                        n_episodes=p['n_episodes'], n_steps=p['n_steps'], reset_at_terminal=False,
                        open_plot=False, plot_title=False, plot_legend=2, do_run=True,
                        do_plot=True, parallel_run=True, n_processes=None,
                        episodes_moving_average=True, episodes_ma_width=100,
                        tasks_moving_average=False, latex_rendering=True)