def __init__(self, actions, gamma=0.9, r_max=1., v_max=None, deduce_v_max=True, n_known=None,
             deduce_n_known=True, epsilon_q=0.1, epsilon_m=None, delta=None, n_states=None,
             min_sampling_probability=0.1, name="MaxQInit"):
    """
    :param actions: action space of the environment
    :param gamma: (float) discount factor
    :param r_max: (float) known upper-bound on the reward function
    :param v_max: (float) known upper-bound on the value function
    :param deduce_v_max: (bool) set to True to deduce v_max from r_max
    :param n_known: (int) count after which a state-action pair is considered known
        (only set n_known if delta and epsilon are not defined)
    :param deduce_n_known: (bool) set to True to deduce n_known from (delta, n_states, epsilon_m)
    :param epsilon_q: (float) precision of value iteration algorithm for Q-value computation
    :param epsilon_m: (float) precision of the learned models in L1 norm
    :param delta: (float) models are learned epsilon_m-closely with probability at least 1 - delta
    :param n_states: (int) number of states
    :param min_sampling_probability: (float) minimum sampling probability of an environment
    :param name: (str)
    """
    RMax.__init__(
        self, actions=actions, gamma=gamma, r_max=r_max, v_max=v_max,
        deduce_v_max=deduce_v_max, n_known=n_known, deduce_n_known=deduce_n_known,
        epsilon_q=epsilon_q, epsilon_m=epsilon_m, delta=delta, n_states=n_states, name=name
    )

    self.min_sampling_probability = min_sampling_probability

    # SA_memory[s][a] is True once the pair (s, a) has been recorded; defaults to False.
    self.SA_memory = defaultdict(lambda: defaultdict(lambda: False))
    # Upper-bounds on the Q-values of previous MDPs
    self.U_memory = []
    # Number of tasks required before the upper bound holds with high confidence
    self.n_required_tasks = number_of_tasks_for_high_confidence_upper_bound(
        delta, min_sampling_probability
    )
def __init__(self, actions, gamma=.9, r_max=1., v_max=None, deduce_v_max=True, n_known=None,
             deduce_n_known=True, epsilon_q=0.1, epsilon_m=None, delta=None, n_states=None,
             name="ExpRMax", path='results/'):
    """
    :param actions: action space of the environment
    :param gamma: (float) discount factor
    :param r_max: (float) known upper-bound on the reward function
    :param v_max: (float) known upper-bound on the value function
    :param deduce_v_max: (bool) set to True to deduce v_max from r_max
    :param n_known: (int) count after which a state-action pair is considered known
    :param deduce_n_known: (bool) set to True to deduce n_known from (delta, n_states, epsilon_m)
    :param epsilon_q: (float) precision of value iteration algorithm for Q-value computation
    :param epsilon_m: (float) precision of the learned models in L1 norm
    :param delta: (float) models are learned epsilon_m-closely with probability at least 1 - delta
    :param n_states: (int) number of states
    :param name: (str)
    :param path: (str) output path, stored for later use (presumably where results
        are written — confirm against callers)
    """
    RMax.__init__(
        self, actions=actions, gamma=gamma, r_max=r_max, v_max=v_max,
        deduce_v_max=deduce_v_max, n_known=n_known, deduce_n_known=deduce_n_known,
        epsilon_q=epsilon_q, epsilon_m=epsilon_m, delta=delta, n_states=n_states, name=name
    )

    # Recorded experiment variables
    self.discounted_return = 0.
    self.total_return = 0.
    self.n_time_steps = 0         # number of time steps elapsed
    self.update_time_steps = []   # time steps at which a model update occurred

    self.path = path
    self.instance_number = 0
    self.run_number = 0
def __init__(self, actions, gamma=.9, r_max=1., v_max=None, deduce_v_max=True, n_known=None,
             deduce_n_known=True, epsilon_q=0.1, epsilon_m=None, delta=None, n_states=None,
             max_memory_size=None, prior=None, estimate_distances_online=True,
             min_sampling_probability=.1, name="LRMax"):
    """
    :param actions: action space of the environment
    :param gamma: (float) discount factor
    :param r_max: (float) known upper-bound on the reward function
    :param v_max: (float) known upper-bound on the value function
    :param deduce_v_max: (bool) set to True to deduce v_max from r_max
    :param n_known: (int) count after which a state-action pair is considered known
        (only set n_known if delta and epsilon are not defined)
    :param deduce_n_known: (bool) set to True to deduce n_known from (delta, n_states, epsilon_m)
    :param epsilon_q: (float) precision of value iteration algorithm for Q-value computation
    :param epsilon_m: (float) precision of the learned models in L1 norm
    :param delta: (float) models are learned epsilon_m-closely with probability at least 1 - delta
    :param n_states: (int) number of states
    :param max_memory_size: (int) maximum number of saved models (infinity if None)
    :param prior: (float) prior knowledge of maximum model's distance
    :param estimate_distances_online: (bool) set to True for online estimation of a tighter
        upper-bound for the model pseudo-distances. The estimation is valid with high probability.
    :param min_sampling_probability: (float) minimum sampling probability of an environment
    :param name: (str)
    """
    self.name = name
    RMax.__init__(
        self, actions=actions, gamma=gamma, r_max=r_max, v_max=v_max,
        deduce_v_max=deduce_v_max, n_known=n_known, deduce_n_known=deduce_n_known,
        epsilon_q=epsilon_q, epsilon_m=epsilon_m, delta=delta, n_states=n_states, name=name
    )

    # Lifelong Learning memories
    self.max_memory_size = max_memory_size
    self.U_memory = []
    self.R_memory = []
    self.T_memory = []
    # SA_memory[s][a] is True once the pair (s, a) has been recorded; defaults to False.
    self.SA_memory = defaultdict(lambda: defaultdict(lambda: False))

    self.U_lip = []
    self.b = self.epsilon_m * (1. + self.gamma * self.v_max)

    # Prior knowledge on the maximum model distance: clip any user-supplied prior to the
    # theoretical maximum, then round to two decimals.
    prior_max = self.r_max + self.gamma * 2. * self.v_max
    self.prior = round(prior_max if prior is None else min(prior, prior_max), 2)

    # Online distances estimation
    self.estimate_distances_online = estimate_distances_online
    self.min_sampling_probability = min_sampling_probability
    # Dictionary of pairwise model distances, valid with high probability; unseen entries
    # default to the theoretical maximum distance.
    self.D = defaultdict(lambda: defaultdict(lambda: prior_max))
    self.n_samples_high_confidence = compute_n_samples_high_confidence(
        min_sampling_probability, delta
    )

    self.update_upper_bound()