def test_indexing(self):
    hovership_params = {'shape': (100, 5)}
    env = Hovership(random_start=True,
                    dynamics_parameters=hovership_params)
    x_seed = np.array([1., 1.])
    y_seed = np.array([1])
    hyperparameters = {
        'outputscale_prior': (1, 0.1),
        'lengthscale_prior': (0.2, 0.05),
        'noise_prior': (0.001, 0.001)
    }
    gpqlearning = GPQLearning(env, 0.9, 0.9, x_seed=x_seed, y_seed=y_seed,
                              gp_params=hyperparameters)
    query = gpqlearning._get_query_from_index(
        (np.array([0.5]), slice(None, None, None)))
    self.assertEqual(query.shape, (5, 2))
    self.assertTrue(np.all(query[:, 0] == 0.5))
    pred = gpqlearning.gp.predict(query).mean.cpu().numpy()
    self.assertEqual(pred.shape, (5,))
def test_policy_convergence(self):
    hovership_params = {'shape': (100, 2)}
    env = Hovership(random_start=True,
                    dynamics_parameters=hovership_params)
    hyperparameters = {
        'outputscale_prior': (1, 0.1),
        'lengthscale_prior': (0.2, 0.05),
        'noise_prior': (0.001, 0.001)
    }
    x_seed = np.array([0.85, 1.])
    y_seed = np.array([1.])
    gpqlearning = GPQLearning(env, 0.9, 0.9, x_seed=x_seed, y_seed=y_seed,
                              gp_params=hyperparameters)
    nA = env.action_space.index_shape[0]
    eps = 0.1
    for episode in range(3):
        state = env.reset()
        failed = env.has_failed
        n_steps = 0
        while not failed and n_steps < 50:
            # Epsilon-greedy action selection over the predicted Q-values
            probas = np.ones(nA) * eps / nA
            probas[np.argmax(gpqlearning[state, :])] += 1 - eps
            action = env.action_space[np.random.choice(nA, p=probas)]
            new_state, reward, failed = env.step(action)
            print(f'Step {n_steps} - State {state} - New state {new_state}'
                  f' - Action {action} - Reward {reward} - Failed '
                  f'{failed}')
            gpqlearning.update(state, action, new_state, reward, failed)
            state = new_state
            n_steps += 1

    def policy_from_gpq(gpq):
        # Greedy policy: for each state, put all the probability mass on the
        # action with the highest predicted Q-value
        q_values = gpq[:, :].reshape(gpq.env.stateaction_space.index_shape)
        policy = np.zeros_like(q_values)
        for i, _ in iter(env.state_space):
            policy[i, np.argmax(q_values[i, :])] = 1
        return policy

    policy = policy_from_gpq(gpqlearning)
    print("The computation of the policy works, but "
          "the convergence value is not tested. "
          f"Policy:\n{policy}")
    self.assertTrue(True)
def load_models(self, skip_local=False):
    model_name = list(self.get_models_to_save().keys())[0]
    if not skip_local:
        load_path = self.local_models_path / model_name
    else:
        load_path = self.models_path / model_name
    self.agent.value_model = GPQLearning.load(load_path)
def load_models(self, skip_local=False):
    model_name = list(self.get_models_to_save().keys())[0]
    if not skip_local:
        load_path = self.local_models_path / model_name
    else:
        load_path = self.models_path / model_name
    self.agent.value_model = GPQLearning.load(load_path,
                                              self.env.stateaction_space,
                                              self.x_seed, self.y_seed)
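# Usage sketch (assumption, not from the original source): `sim` stands for a
# simulation object exposing the load_models method above, with an `agent`
# attribute. Passing skip_local=True reads the saved model from models_path
# instead of local_models_path, e.g. when resuming from a synced checkpoint.
def example_restore_value_model(sim):
    sim.load_models(skip_local=True)
    return sim.agent.value_model  # restored GPQLearning instance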
def __init__(self, env, greed, step_size, discount_rate, q_x_seed, q_y_seed,
             gamma_optimistic, gamma_hard, lambda_hard, gamma_soft,
             s_x_seed, s_y_seed, q_gp_params=None, s_gp_params=None,
             keep_seed_in_data=True):
    """
    Initializer
    :param env: the environment
    :param greed: the epsilon parameter of the ConstrainedEpsilonGreedy
        policy
    :param step_size: the step size in the Q-Learning update
    :param discount_rate: the discount rate
    :param q_x_seed: the seed input of the GP for the Q-Values model
    :param q_y_seed: the seed output of the GP for the Q-Values model
    :param gamma_optimistic: the gamma parameter for Q_optimistic
    :param gamma_hard: the gamma parameter for Q_hard, the set where
        Q-Learning is constrained (~ Q_cautious)
    :param lambda_hard: the lambda parameter for Q_hard AND Q_soft
    :param gamma_soft: the gamma parameter for Q_soft, the set outside of
        which the safety measure is updated
    :param s_x_seed: the seed input of the GP for the safety model
    :param s_y_seed: the seed output of the GP for the safety model
    :param q_gp_params: the parameters defining the GP for the Q-Values
        model. See edge.models.inference.MaternGP for more information
    :param s_gp_params: the parameters defining the GP for the safety
        model. See edge.models.inference.MaternGP for more information
    :param keep_seed_in_data: whether to keep the seed data in the GPs
        datasets. Should be True, otherwise GPyTorch fails.
    """
    Q_model = GPQLearning(env.stateaction_space, step_size, discount_rate,
                          x_seed=q_x_seed, y_seed=q_y_seed,
                          gp_params=q_gp_params)
    safety_model = MaternSafety(env.stateaction_space, gamma_optimistic,
                                x_seed=s_x_seed, y_seed=s_y_seed,
                                gp_params=s_gp_params)
    super(SoftHardLearner, self).__init__(env, Q_model, safety_model)

    self.Q_model = Q_model
    self.safety_model = safety_model
    self.lambda_hard = lambda_hard
    self.gamma_hard = gamma_hard
    self.gamma_soft = gamma_soft
    self._gamma_optimistic = gamma_optimistic

    self.constrained_value_policy = ConstrainedEpsilonGreedy(
        self.env.stateaction_space, greed)
    self.safety_maximization_policy = SafetyMaximization(
        self.env.stateaction_space)
    self.active_sampling_policy = SafetyActiveSampling(
        self.env.stateaction_space)

    self.keep_seed_in_data = keep_seed_in_data
    if not keep_seed_in_data:
        self.Q_model.empty_data()

    self.violated_soft_constraint = None
    self.updated_safety = None
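# Usage sketch (not from the original source): how a SoftHardLearner might be
# instantiated on an environment such as the Hovership used in the tests
# above. The gp_params dict mirrors those tests; all other numeric values are
# illustrative assumptions, not tuned settings.
def example_soft_hard_learner(env):
    gp_params = {
        'outputscale_prior': (1, 0.1),
        'lengthscale_prior': (0.2, 0.05),
        'noise_prior': (0.001, 0.001)
    }
    x_seed = np.array([1., 1.])
    y_seed = np.array([1.])
    return SoftHardLearner(
        env, greed=0.1, step_size=0.6, discount_rate=0.9,
        q_x_seed=x_seed, q_y_seed=y_seed,
        gamma_optimistic=0.6, gamma_hard=0.9, lambda_hard=0.05,
        gamma_soft=0.8,
        s_x_seed=x_seed, s_y_seed=y_seed,
        q_gp_params=gp_params, s_gp_params=gp_params)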
def __init__(self, env, safety_measure, greed, step_size, discount_rate,
             safety_threshold, x_seed, y_seed, gp_params=None,
             keep_seed_in_data=True):
    """
    Initializer
    :param env: the environment
    :param safety_measure: either SafetyTruth or SafetyModel of the
        environment
    :param greed: the epsilon parameter of the ConstrainedEpsilonGreedy
        policy
    :param step_size: the step size in the Q-Learning update
    :param discount_rate: the discount rate
    :param safety_threshold: the lambda threshold used to evaluate safety.
        This is 0 theoretically, but an Agent that is at the exact boundary
        of the viability kernel still fails due to rounding errors. Hence,
        this should be a small, positive value.
    :param x_seed: the seed input of the GP
    :param y_seed: the seed output of the GP
    :param gp_params: the parameters defining the GP. See
        edge.models.inference.MaternGP for more information
    :param keep_seed_in_data: whether to keep the seed data in the GP
        dataset. Should be True, otherwise GPyTorch fails.
    """
    Q_model = GPQLearning(env.stateaction_space, step_size, discount_rate,
                          x_seed=x_seed, y_seed=y_seed,
                          gp_params=gp_params)
    super(ConstrainedQLearner, self).__init__(env, Q_model)

    self.Q_model = Q_model
    self.safety_measure = safety_measure
    self.constrained_value_policy = ConstrainedEpsilonGreedy(
        self.env.stateaction_space, greed)
    self.safety_maximization_policy = SafetyMaximization(
        self.safety_measure.stateaction_space)
    self.safety_threshold = safety_threshold

    self.keep_seed_in_data = keep_seed_in_data
    if not keep_seed_in_data:
        self.Q_model.empty_data()
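# Usage sketch (not from the original source): constructing a
# ConstrainedQLearner. `safety_truth` stands for a SafetyTruth (or
# SafetyModel) of the environment, as described in the docstring; how it is
# built is not shown here. All numeric values are illustrative assumptions.
def example_constrained_q_learner(env, safety_truth):
    return ConstrainedQLearner(
        env, safety_truth,
        greed=0.1, step_size=0.6, discount_rate=0.9,
        safety_threshold=0.05,
        x_seed=np.array([1., 1.]), y_seed=np.array([1.]),
        gp_params={
            'outputscale_prior': (1, 0.1),
            'lengthscale_prior': (0.2, 0.05),
            'noise_prior': (0.001, 0.001)
        })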
def load_models(self, skip_local=False):
    from edge.model.safety_models import MaternSafety
    from edge.model.value_models import GPQLearning
    models_names = list(self.get_models_to_save().keys())
    loaders = {
        'Q_model': lambda mpath: GPQLearning(mpath, self.env,
                                             self.q_x_seed, self.q_y_seed),
        'safety_model': lambda mpath: MaternSafety(mpath, self.env,
                                                   self.gamma_optimistic,
                                                   self.s_x_seed,
                                                   self.s_y_seed),
    }
    for mname in models_names:
        if not skip_local:
            load_path = self.local_models_path / mname
        else:
            load_path = self.models_path / mname
        setattr(self.agent, mname, loaders[mname](load_path))
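# Usage sketch (assumption, not from the original source): `simulation` is an
# object exposing the load_models method above, with an `agent` attribute.
# The loaders dict maps each saved model's attribute name to a callable that
# rebuilds it from its folder, so both GPs are restored in one call.
def example_restore_agent_models(simulation):
    simulation.load_models(skip_local=True)
    return simulation.agent.Q_model, simulation.agent.safety_model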
def __init__(self, env, greed, step_size, discount_rate, x_seed, y_seed,
             gp_params=None, keep_seed_in_data=True):
    """
    Initializer
    :param env: the environment
    :param greed: the epsilon parameter of the EpsilonGreedy policy
    :param step_size: the step size in the Q-Learning update
    :param discount_rate: the discount rate
    :param x_seed: the seed input of the GP
    :param y_seed: the seed output of the GP
    :param gp_params: the parameters defining the GP. See
        edge.models.inference.MaternGP for more information
    :param keep_seed_in_data: whether to keep the seed data in the GP
        dataset. Should be True, otherwise GPyTorch fails.
    """
    Q_model = GPQLearning(env.stateaction_space, step_size, discount_rate,
                          x_seed=x_seed, y_seed=y_seed,
                          gp_params=gp_params)
    super(QLearner, self).__init__(env, Q_model)

    self.Q_model = Q_model
    self.policy = EpsilonGreedy(env, greed)

    self.keep_seed_in_data = keep_seed_in_data
    if not keep_seed_in_data:
        self.Q_model.empty_data()

    self._step_size_decrease_index = 1
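# Usage sketch (not from the original source): constructing a QLearner on the
# Hovership environment used in the tests above. The dynamics_parameters and
# gp_params mirror those tests; greed, step_size, and discount_rate values are
# illustrative assumptions.
def example_q_learner():
    env = Hovership(random_start=True,
                    dynamics_parameters={'shape': (100, 2)})
    return QLearner(
        env, greed=0.1, step_size=0.6, discount_rate=0.9,
        x_seed=np.array([1., 1.]), y_seed=np.array([1.]),
        gp_params={
            'outputscale_prior': (1, 0.1),
            'lengthscale_prior': (0.2, 0.05),
            'noise_prior': (0.001, 0.001)
        })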
def __init__(self, env, greed, step_size, discount_rate, q_x_seed, q_y_seed,
             gamma_optimistic, gamma_cautious, lambda_cautious,
             s_x_seed, s_y_seed, q_gp_params=None, s_gp_params=None,
             keep_seed_in_data=True):
    """
    Initializer
    :param env: the environment
    :param greed: the epsilon parameter of the ConstrainedEpsilonGreedy
        policy
    :param step_size: the step size in the Q-Learning update
    :param discount_rate: the discount rate
    :param q_x_seed: the seed input of the GP for the Q-Values model
    :param q_y_seed: the seed output of the GP for the Q-Values model
    :param gamma_optimistic: the (start, end) values of the gamma parameter
        for Q_optimistic
    :param gamma_cautious: the (start, end) values of the gamma parameter
        for Q_cautious
    :param lambda_cautious: the (start, end) values of the lambda parameter
        for Q_cautious
    :param s_x_seed: the seed input of the GP for the safety model
    :param s_y_seed: the seed output of the GP for the safety model
    :param q_gp_params: the parameters defining the GP for the Q-Values
        model. See edge.models.inference.MaternGP for more information
    :param s_gp_params: the parameters defining the GP for the safety
        model. See edge.models.inference.MaternGP for more information
    :param keep_seed_in_data: whether to keep the seed data in the GPs
        datasets. Should be True, otherwise GPyTorch fails.
    """
    self.lambda_cautious_start, self.lambda_cautious_end = lambda_cautious
    self.gamma_cautious_start, self.gamma_cautious_end = gamma_cautious
    self.gamma_optimistic_start, self.gamma_optimistic_end = \
        gamma_optimistic
    self.lambda_cautious = self.lambda_cautious_start
    self.gamma_cautious = self.gamma_cautious_start

    self._step_size_decrease_index = 1

    Q_model = GPQLearning(env.stateaction_space, step_size, discount_rate,
                          x_seed=q_x_seed, y_seed=q_y_seed,
                          gp_params=q_gp_params)
    safety_model = MaternSafety(env.stateaction_space,
                                self.gamma_optimistic_start,
                                x_seed=s_x_seed, y_seed=s_y_seed,
                                gp_params=s_gp_params)

    super(ValuesAndSafetyCombinator, self).__init__(
        env=env,
        greed=greed,  # Unused: we define another policy
        step_size=step_size,
        discount_rate=discount_rate,
        x_seed=q_x_seed,
        y_seed=q_y_seed,
        gp_params=q_gp_params,
        keep_seed_in_data=keep_seed_in_data)

    self.Q_model = Q_model
    self.safety_model = safety_model

    self.constrained_value_policy = ConstrainedEpsilonGreedy(
        self.env.stateaction_space, greed)
    self.safety_maximization_policy = SafetyMaximization(
        self.env.stateaction_space)
    self._training_greed = self.greed

    self.keep_seed_in_data = keep_seed_in_data
    if not keep_seed_in_data:
        self.Q_model.empty_data()
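# Usage sketch (not from the original source): the gamma_optimistic,
# gamma_cautious, and lambda_cautious arguments are (start, end) pairs, as the
# unpacking in __init__ above shows; the learner starts from the first value
# and keeps the second as a target (the annealing itself is not shown here).
# If ValuesAndSafetyCombinator is meant to be subclassed rather than
# instantiated directly, a subclass would forward the same arguments. All
# numeric values below are illustrative assumptions.
def example_values_and_safety_combinator(env):
    gp_params = {
        'outputscale_prior': (1, 0.1),
        'lengthscale_prior': (0.2, 0.05),
        'noise_prior': (0.001, 0.001)
    }
    return ValuesAndSafetyCombinator(
        env, greed=0.1, step_size=0.6, discount_rate=0.9,
        q_x_seed=np.array([1., 1.]), q_y_seed=np.array([1.]),
        gamma_optimistic=(0.6, 0.7),
        gamma_cautious=(0.9, 0.9),
        lambda_cautious=(0., 0.05),
        s_x_seed=np.array([1., 1.]), s_y_seed=np.array([1.]),
        q_gp_params=gp_params, s_gp_params=gp_params)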