def __init__(self, name, max_samples, greed, step_size, discount_rate,
             x_seed, y_seed, shape, every):
    dynamics_parameters = {'shape': shape}
    self.env = LowGoalHovership(dynamics_parameters=dynamics_parameters)

    self.ground_truth = SafetyTruth(self.env)
    self.ground_truth.from_vibly_file(
        Path(__file__).parent.parent.parent / 'data' / 'ground_truth' /
        'from_vibly' / 'hover_map.pickle')

    self.hyperparameters = {
        'outputscale_prior': (0.4, 2),
        'lengthscale_prior': (0.2, 0.2),
        'noise_prior': (0.001, 0.002)
    }
    self.agent = ConstrainedQLearner(self.env, self.ground_truth, greed,
                                     step_size, discount_rate,
                                     safety_threshold=0.05,
                                     x_seed=x_seed, y_seed=y_seed,
                                     gp_params=self.hyperparameters)

    plotters = {
        'Q-Values': QValuePlotter(self.agent, self.agent.safety_measure)
    }

    output_directory = Path(__file__).parent.resolve()
    super(ConstrainedSimulation, self).__init__(output_directory, name,
                                                plotters)

    self.max_samples = max_samples
    self.every = every
def test_from_vibly(self):
    env = Hovership()
    truth = SafetyTruth(env)
    vibly_file_path = '../data/ground_truth/from_vibly/hover_map.pickle'
    truth.from_vibly_file(vibly_file_path)
    self.assertTrue(isinstance(truth.stateaction_space, StateActionSpace))
    self.assertEqual(truth.viable_set.shape, truth.measure_value.shape)
    self.assertEqual(truth.viable_set.shape, truth.unviable_set.shape)
    self.assertEqual(truth.viable_set.shape, truth.failure_set.shape)
def test_safety_map(self):
    env = MyDiscreteHovership()
    safety = SafetyTruth(env)
    safety.compute()
    true_safety_map = np.array([[False, False, False, False],
                                [False, False, False, False],
                                [False, False, False, True],
                                [False, True, True, True],
                                [True, True, True, True],
                                [True, True, True, True]])
    self.assertTrue(
        np.all(safety.viable_set == true_safety_map),
        'Error: computed safety map is different from ground truth.\n'
        f'Computed:\n{safety.viable_set}\nGround truth:\n{true_safety_map}'
    )
class SafetyTruthComputation(TruthComputationSimulation):
    def __init__(self, name, env_name, discretization_shape, *args, **kwargs):
        if env_name == 'cartpole':
            env_builder = ContinuousCartPole
        else:
            raise ValueError(f'Environment {env_name} is not supported')
        output_directory = Path(__file__).parent.resolve()
        super(SafetyTruthComputation, self).__init__(output_directory, name,
                                                     safety_name(env_name))

        self.env = env_builder(discretization_shape=discretization_shape,
                               *args, **kwargs)
        self.truth = SafetyTruth(self.env)

        self.Q_map_path = self.output_directory / (str(Q_map_name(env_name))
                                                   + '.npy')
        self.save_path = self.output_directory / safety_name(env_name)

        logger.info(config_msg(f"env_name='{env_name}'"))
        logger.info(
            config_msg(f"discretization_shape='{discretization_shape}'"))
        logger.info(config_msg(f"args={args}"))
        logger.info(config_msg(f"kwargs={kwargs}"))

    def run(self):
        logger.info('Launched computation of viable set')
        if not self.Q_map_path.exists():
            errormsg = f'The transition map could not be found at ' \
                       f'{str(self.Q_map_path)}. Please compute it first.'
            logger.critical(errormsg)
            raise FileNotFoundError(errormsg)
        tick = time.time()
        self.truth.compute(self.Q_map_path)
        tock = time.time()
        logger.info(f'Done in {tock - tick:.2f} s.')

        self.truth.save(str(self.save_path))
        logger.info(f'Output saved in {str(self.save_path)}')
def get_ground_truth(self):
    self.ground_truth_path = self.local_models_path / \
        'safety_ground_truth.npz'
    load = self.ground_truth_path.exists()
    if load:
        try:
            ground_truth = SafetyTruth.load(self.ground_truth_path, self.env)
        except ValueError:
            load = False
    if not load:
        ground_truth = SafetyTruth(self.env)
        ground_truth.compute()
        ground_truth.save(self.ground_truth_path)
    return ground_truth
def test_get_training_examples(self):
    env = Hovership()
    truth = SafetyTruth(env)
    vibly_file_path = '../data/ground_truth/from_vibly/hover_map.pickle'
    truth.from_vibly_file(vibly_file_path)

    train_x, train_y = truth.get_training_examples(n_examples=2000)
    self.assertEqual(train_x.shape[0], train_y.shape[0])
    self.assertEqual(train_x.shape[0], 2000)
    self.assertEqual(train_x.shape[1], truth.stateaction_space.index_dim)

    train_x, train_y = truth.get_training_examples(n_examples=2000,
                                                   from_failure=True,
                                                   viable_proportion=0.6)
    self.assertEqual(train_x.shape[0], train_y.shape[0])
    self.assertEqual(train_x.shape[0], 2000)
    self.assertEqual(train_x.shape[1], truth.stateaction_space.index_dim)
    self.assertTrue((train_y[:1200] > 0).all())
    self.assertTrue((train_y[1200:] == 0).all())
class ToySimulation(Simulation):
    def __init__(self, output_directory, name, max_samples=250,
                 gamma_optimistic=0.9, gamma_cautious=0.9,
                 lambda_cautious=0.1, lengthscale_prior=(0.1, 0.05),
                 shape=(10, 10), hyperparameters=None, ground_truth=None,
                 every=50):
        x_seed = np.array([1.45, 0.5])
        y_seed = np.array([1.])
        dynamics_parameters = {'shape': shape}
        self.env = Hovership(random_start=False,
                             dynamics_parameters=dynamics_parameters,
                             default_initial_state=x_seed[:1])

        if hyperparameters is None:
            hyperparameters = {}
        default_hyperparameters = {
            'outputscale_prior': (1, 0.1),
            'lengthscale_prior': lengthscale_prior,
            'noise_prior': (0.001, 0.001)
        }
        default_hyperparameters.update(hyperparameters)
        hyperparameters = default_hyperparameters

        if ground_truth is None:
            self.ground_truth = None
        else:
            self.ground_truth = SafetyTruth(self.env)
            self.ground_truth.from_vibly_file(ground_truth)

        self.agent = SafetyLearner(
            env=self.env,
            gamma_optimistic=gamma_optimistic,
            gamma_cautious=gamma_cautious,
            lambda_cautious=lambda_cautious,
            x_seed=x_seed,
            y_seed=y_seed,
            gp_params=hyperparameters,
        )
        self.agent.reset()

        plotters = {'Safety': SafetyPlotter(self.agent, self.ground_truth)}

        super(ToySimulation, self).__init__(output_directory, name, plotters)

        self.max_samples = max_samples
        self.every = every

    def run(self):
        n_samples = 0
        while n_samples < self.max_samples:
            self.agent.reset()
            failed = self.agent.failed
            n_steps = 0
            while not failed and n_steps < 50:
                if n_samples % self.every == 0:
                    self.save_figs(prefix=f'{n_samples}')
                n_samples += 1
                n_steps += 1
                old_state = self.agent.state
                new_state, reward, failed, _ = self.agent.step()
                action = self.agent.last_action
                print(f'Step {n_samples}/{self.max_samples} - {old_state} '
                      f'-> {action} -> {new_state} ({failed})')
                self.on_run_iteration(old_state, action, new_state, reward,
                                      failed)
                if n_samples >= self.max_samples:
                    break
        self.compile_gif()
        gamma_cautious=gamma_cautious,
        lambda_cautious=lambda_cautious
    )
elif args.nominal == RANDOM:
    agent = RandomSafetyLearner.load(
        env=env,
        mpath=apath,
        gamma_cautious=gamma_cautious,
        lambda_cautious=lambda_cautious
    )
else:
    raise ValueError

truth_path = here.parent.parent / 'data' / 'ground_truth' / 'from_vibly' / \
    'hover_map.pickle'
ground_truth = SafetyTruth(env)
ground_truth.from_vibly_file(truth_path)

dataset_path = here / f'{args.nominal}_controller' / 'data' / 'train.csv'
dataset = Dataset.load(dataset_path, group_name='Training')

print(f"EVALUATING {args.nominal} AGENT AFTER BATCH #{args.nmodel}")
n_samples = len(dataset.loc[dataset.df['Training'] <= args.nmodel])
print(f'Number of training samples: {n_samples}')

optimistic_qv_ratio = learned_qv(agent, ground_truth, cautious=False)
print(f"Q_opt / Q_V ratio: {optimistic_qv_ratio*100:.3f} %")
cautious_qv_ratio = learned_qv(agent, ground_truth, cautious=True)
print(f"Q_caut / Q_V ratio: {cautious_qv_ratio*100:.3f} %")

if args.nominal == AFFINE:
    mean_diff, inf_diff = difference(agent, ground_truth)
    print(f"L2 difference with optimal controller (state average): "
class EpsCorlSimulation(ModelLearningSimulation):
    def __init__(self, name, max_samples, greed, step_size, discount_rate,
                 gamma_optimistic, gamma_cautious, lambda_cautious,
                 q_x_seed, q_y_seed, s_x_seed, s_y_seed,
                 shape, every, glie_start, s_epochs):
        self.s_epochs = s_epochs
        dynamics_parameters = {
            'shape': shape
        }
        self.env = LowGoalSlip(dynamics_parameters=dynamics_parameters)

        self.q_hyperparameters = {
            'outputscale_prior': (0.4, 2),
            'lengthscale_prior': (0.05, 0.1),
            'noise_prior': (0.001, 0.002)
        }
        self.s_hyperparameters = {
            'outputscale_prior': (0.4, 2),
            'lengthscale_prior': (0.2, 0.1),
            'noise_prior': (0.001, 0.002)
        }
        self.q_x_seed = q_x_seed
        self.q_y_seed = q_y_seed
        self.s_x_seed = s_x_seed
        self.s_y_seed = s_y_seed

        self.gamma_optimistic_start, self.gamma_optimistic_end = \
            identity_or_duplicated_value(gamma_optimistic)
        self.gamma_cautious_start, self.gamma_cautious_end = \
            identity_or_duplicated_value(gamma_cautious)
        self.lambda_cautious_start, self.lambda_cautious_end = \
            identity_or_duplicated_value(lambda_cautious)
        self.gamma_optimistic = self.gamma_optimistic_start
        self.gamma_cautious = self.gamma_cautious_start
        self.lambda_cautious = self.lambda_cautious_start

        self.agent = EpsCorlLearner(
            self.env,
            greed=greed,
            step_size=step_size,
            discount_rate=discount_rate,
            q_x_seed=self.q_x_seed,
            q_y_seed=self.q_y_seed,
            gamma_optimistic=self.gamma_optimistic,
            gamma_cautious=self.gamma_cautious,
            lambda_cautious=self.lambda_cautious,
            s_x_seed=s_x_seed,
            s_y_seed=s_y_seed,
            q_gp_params=self.q_hyperparameters,
            s_gp_params=self.s_hyperparameters,
        )

        self.ground_truth = SafetyTruth(self.env)
        self.ground_truth.from_vibly_file(
            Path(__file__).parent.parent.parent / 'data' / 'ground_truth' /
            'from_vibly' / 'slip_map.pickle'
        )

        plotters = {
            'Q-Values_Safety': QValueAndSafetyPlotter(self.agent,
                                                      self.ground_truth)
        }
        # plotters = {}

        output_directory = Path(__file__).parent.resolve()
        super(EpsCorlSimulation, self).__init__(output_directory, name,
                                                plotters)

        self.max_samples = max_samples
        self.every = every
        if isinstance(glie_start, float):
            self.glie_start = int(glie_start * self.max_samples)
        else:
            self.glie_start = glie_start

    def get_models_to_save(self):
        # The keys must be the same as the actual names of the attributes:
        # this is used in load_models. This is hacky and should be replaced.
        return {
            'Q_model': self.agent.Q_model,
            'safety_model': self.agent.safety_model
        }

    def load_models(self, skip_local=False):
        from edge.model.safety_models import MaternSafety
        from edge.model.value_models import GPQLearning
        models_names = list(self.get_models_to_save().keys())
        loaders = {
            'Q_model': lambda mpath: GPQLearning(mpath, self.env,
                                                 self.q_x_seed,
                                                 self.q_y_seed),
            'safety_model': lambda mpath: MaternSafety(mpath, self.env,
                                                       self.gamma_optimistic,
                                                       self.s_x_seed,
                                                       self.s_y_seed),
        }
        for mname in models_names:
            if not skip_local:
                load_path = self.local_models_path / mname
            else:
                load_path = self.models_path / mname
            setattr(self.agent, mname, loaders[mname](load_path))

    def run(self):
        n_samples = 0
        self.save_figs(prefix='0')

        # Train the GP hyperparameters on ground-truth examples,
        # with a decreasing learning rate
        print('Optimizing hyperparameters...')
        s_train_x, s_train_y = self.ground_truth.get_training_examples()
        self.agent.fit_models(
            s_epochs=self.s_epochs, s_train_x=s_train_x, s_train_y=s_train_y,
            s_optimizer_kwargs={'lr': 0.1}
        )
        self.agent.fit_models(
            s_epochs=self.s_epochs, s_train_x=s_train_x, s_train_y=s_train_y,
            s_optimizer_kwargs={'lr': 0.01}
        )
        self.agent.fit_models(
            s_epochs=self.s_epochs, s_train_x=s_train_x, s_train_y=s_train_y,
            s_optimizer_kwargs={'lr': 0.001}
        )
        print('Lengthscale:',
              self.agent.safety_model.gp.covar_module.base_kernel.lengthscale)
        print('Outputscale:',
              self.agent.safety_model.gp.covar_module.outputscale)
        print('Done.')

        print('Training...')
        while n_samples < self.max_samples:
            reset_state = self.agent.get_random_safe_state()
            self.agent.reset(reset_state)
            failed = self.agent.failed
            n_steps = 0
            while not failed and n_steps < 50:
                n_samples += 1
                n_steps += 1
                old_state = self.agent.state
                new_state, reward, failed = self.agent.step()
                action = self.agent.last_action
                # * start reducing eps to converge to a greedy policy.
                if self.glie_start is not None and \
                        n_samples > self.glie_start:
                    self.agent.greed *= (n_samples - self.glie_start) / (
                        n_samples - self.glie_start + 1)
                self.agent.gamma_optimistic = affine_interpolation(
                    n_samples / self.max_samples,
                    self.gamma_optimistic_start,
                    self.gamma_optimistic_end
                )
                self.agent.gamma_cautious = affine_interpolation(
                    n_samples / self.max_samples,
                    self.gamma_cautious_start,
                    self.gamma_cautious_end
                )
                self.agent.lambda_cautious = affine_interpolation(
                    n_samples / self.max_samples,
                    self.lambda_cautious_start,
                    self.lambda_cautious_end
                )
                color = None if not self.agent.has_explored \
                    else [0.3, 0.3, 0.9]
                self.on_run_iteration(n_samples, old_state, action, new_state,
                                      reward, failed, color=color)
                if n_samples >= self.max_samples:
                    break
            self.agent.reset()
        print('Done.')

        self.save_figs(prefix=f'{self.name}_final')
        self.compile_gif()

    def on_run_iteration(self, n_samples, *args, **kwargs):
        super(EpsCorlSimulation, self).on_run_iteration(*args, **kwargs)
        print(f'Iteration {n_samples}/{self.max_samples}')
        if n_samples % self.every == 0:
            self.save_figs(prefix=f'{n_samples}')
class FixedControllerLowdim(ModelLearningSimulation):
    @log_simulation_parameters
    def __init__(self, name, shape, gamma_cautious, lambda_cautious,
                 gamma_optimistic, controller, reset_in_safe_state,
                 n_episodes_train, n_episodes_test, n_train_test,
                 plot_every=1):
        shapedict = {} if shape is None else {'shape': shape}
        self.env = LowGoalHovership(
            goal_state=False,
            initial_state=np.array([1.3]),
            **shapedict  # This matters for the GP
        )

        x_seed = np.array([[2, .1]])
        y_seed = np.array([.5])
        lengthscale_means = (0.2, 0.2)
        lengthscale_vars = (0.1, 0.1)
        lengthscale_prior = tuple(zip(lengthscale_means, lengthscale_vars))
        outputscale_prior = (1., 10.)
        noise_prior = (0.007, 0.1)

        gp_params = {
            'train_x': x_seed,
            'train_y': y_seed,
            'outputscale_prior': outputscale_prior,
            'lengthscale_prior': lengthscale_prior,
            'noise_prior': noise_prior,
            'mean_constant': None,
            'dataset_type': None,
            'dataset_params': None,
            # Other possible options:
            # 'dataset_type': 'downsampling',
            # 'dataset_params': {'append_every': 10},
            # 'dataset_type': 'neighborerasing',
            # 'dataset_params': {'radius': 0.01},
            'value_structure_discount_factor': None,
        }
        if controller == 'random':
            agent = RandomSafetyLearner(
                env=self.env,
                s_gp_params=gp_params.copy(),
                gamma_cautious=gamma_cautious,
                lambda_cautious=lambda_cautious,
                gamma_optimistic=gamma_optimistic,
            )
        elif controller == 'affine':
            agent = AffineSafetyLearner(
                env=self.env,
                offset=(np.array([2.0]), np.array([0.1])),
                jacobian=np.array([[(0.7 - 0.1) / (0. - 2.)]]),
                s_gp_params=gp_params.copy(),
                gamma_cautious=gamma_cautious,
                lambda_cautious=lambda_cautious,
                gamma_optimistic=gamma_optimistic,
            )
        else:
            raise ValueError('Invalid controller')
        self.agent = agent

        truth_path = Path(__file__).parent.parent.parent / 'data' / \
            'ground_truth' / 'from_vibly' / f'hover_map.pickle'
        self.ground_truth = SafetyTruth(self.env)
        self.ground_truth.from_vibly_file(truth_path)

        ctrlr = None if controller == 'random' else self.agent.policy
        plotters = {
            'safety': SafetyPlotter(self.agent,
                                    ground_truth=self.ground_truth,
                                    controller=ctrlr)
        }
        output_directory = Path(__file__).parent.resolve()
        super().__init__(output_directory, name, plotters)

        self.reset_in_safe_state = reset_in_safe_state
        self.n_episodes_train = n_episodes_train
        self.n_episodes_test = n_episodes_test
        self.n_train_test = n_train_test
        self.plot_every = plot_every

        self.training_dataset = Dataset(*Dataset.DEFAULT_COLUMNS, CTRLR_VIAB,
                                        FLWD_CTRLR, group_name=GROUP_NAME,
                                        name='train')
        self.testing_dataset = Dataset(*Dataset.DEFAULT_COLUMNS, SAFETY_NAME,
                                       CTRLR_VIAB, FLWD_CTRLR,
                                       group_name=GROUP_NAME, name=f'test')

    def run_episode(self, n_episode, prefix=None):
        episode = {cname: []
                   for cname in self.training_dataset.columns_wo_group}
        done = self.env.done
        n = 0
        if prefix is not None:
            self.save_figs(prefix=f'{prefix}_{n}')
        while not done:
            old_state = self.agent.state
            new_state, reward, failed, done = self.agent.step()
            action = self.agent.last_action
            ctrlr_action = self.agent.last_controller_action
            ctrlr_viab = self.ground_truth.is_viable(state=old_state,
                                                     action=ctrlr_action)
            flwd_ctrlr = self.agent.followed_controller
            append_to_episode(self.training_dataset, episode, old_state,
                              action, new_state, reward, failed, done,
                              ctrlr_viab, flwd_ctrlr)
            if self.agent.training_mode:
                marker = None
                color = [1, 0, 0] if self.agent.followed_controller \
                    else [0, 1, 0]
                super().on_run_iteration(state=old_state, action=action,
                                         new_state=new_state, reward=reward,
                                         failed=failed, color=color,
                                         marker=marker)
                if prefix is not None:
                    if (n + 1) % self.plot_every == 0:
                        self.save_figs(prefix=f'{prefix}_{n}')
            n += 1
        len_episode = len(episode[self.training_dataset.REWARD])
        episode[self.training_dataset.EPISODE] = [n_episode] * len_episode
        return episode

    def reset_agent_state(self):
        if self.reset_in_safe_state:
            is_viable = self.agent.safety_model.measure(
                slice(None, None, None),
                lambda_threshold=self.agent.lambda_cautious,
                gamma_threshold=self.agent.gamma_cautious) > 0
            if any(is_viable):
                viable_indexes = np.atleast_1d(
                    np.argwhere(is_viable).squeeze())
                state_index = viable_indexes[np.random.choice(
                    len(viable_indexes))]
                s = self.env.stateaction_space.state_space[state_index]
                self.agent.reset(s)
        while self.env.done:
            s = self.agent.reset()
        return s

    @timeit
    def train_agent(self, n_train):
        self.agent.training_mode = True
        # self.save_figs(prefix=f'{n_train}ep{0}')
        for n in range(self.n_episodes_train):
            self.reset_agent_state()
            episode = self.run_episode(n, prefix=f'{n_train}ep{n+1}')
            self.training_dataset.add_group(episode, group_number=n_train)
            # if (n+1) % self.plot_every == 0:
            #     self.save_figs(prefix=f'{n_train}ep{n+1}')

    @timeit
    def test_agent(self, n_test):
        self.agent.training_mode = False
        for n in range(self.n_episodes_test):
            self.reset_agent_state()
            episode = self.run_episode(n)
            self.testing_dataset.add_group(episode, group_number=n_test)

    @timeit
    def log_performance(self, n_train, ds, name_in_log, duration=None,
                        header=True, limit_episodes=None):
        df = ds.df
        if n_train is not None:
            train = df.loc[df[ds.group_name] == n_train, :]
        else:
            train = df
        r, f, xplo_steps, off_ctrlr = average_performances(
            train, ds.group_name, ds.EPISODE, limit_episodes)
        n_steps = len(train)
        caveat = '' if limit_episodes is None \
            else f'(last {limit_episodes} episodes) '
        header = '-------- Performance --------\n' if header else ''
        message = (f'--- {name_in_log} {caveat}\n'
                   f'Average total reward per episode: {r:.3f}\n'
                   f'Average number of failures: {f * 100:.3f} %\n'
                   f'Number of exploration steps: {xplo_steps} / {n_steps}\n'
                   f'Number of off-controller steps: {off_ctrlr} / {n_steps}')
        if duration is not None:
            message += f'\nComputation time: {duration:.3f} s'
        logging.info(header + message)

    def log_cautious_qv_ratio(self):
        ratio = cautious_qv(self.agent, self.ground_truth)
        message = f'Proportion of Q_V labeled as cautious: {ratio*100:.3f} %'
        logging.info(message)

    def log_memory(self):
        if device == cuda:
            message = ('Memory usage\n' + torch.cuda.memory_summary())
            logging.info(message)

    def log_samples(self):
        n_samples = self.agent.safety_model.gp.train_x.shape[0]
        logging.info(f'Training dataset size: {n_samples}')

    @timeit
    def checkpoint(self, n):
        self.training_dataset.save(self.data_path)
        self.testing_dataset.save(self.data_path)
        self.save_safety_model(f'safety_model_{n}')

    def save_safety_model(self, name):
        savepath = self.local_models_path / 'safety_model' / name
        savepath.mkdir(exist_ok=True, parents=True)
        self.agent.safety_model.save(savepath, save_data=True)

    def get_models_to_save(self):
        return {'safety_model': self.agent.safety_model}

    @timeit
    def run(self):
        for n in range(self.n_train_test):
            logging.info(f'========= CYCLE {n+1}/{self.n_train_test} ========')
            t = 0 if self.n_train_test == 1 else n / (self.n_train_test - 1)
            self.agent.update_safety_params(t=t)
            try:
                train_t = self.train_agent(n)
            except RuntimeError as e:
                train_t = None
                logging.critical(f'train_agent({n}) failed:\n{str(e)}')
                self.log_memory()
                torch.cuda.empty_cache()
            finally:
                self.log_performance(n, self.training_dataset, 'Training',
                                     train_t, header=True,
                                     limit_episodes=self.n_episodes_train)
            self.log_samples()
            try:
                test_t = self.test_agent(n)
            except RuntimeError as e:
                test_t = None
                logging.critical(f'test_agent({n}) failed:\n{str(e)}')
                torch.cuda.empty_cache()
            finally:
                self.log_performance(n, self.testing_dataset, 'Testing',
                                     test_t, header=False,
                                     limit_episodes=None)
            chkpt_t = self.checkpoint(n)
            logging.info(f'Checkpointing time: {chkpt_t:.3f} s')
        self.log_performance(None, self.training_dataset,
                             'Training - Full dataset', duration=None,
                             header=False, limit_episodes=None)
        self.log_performance(None, self.testing_dataset,
                             'Testing - Full dataset', duration=None,
                             header=False, limit_episodes=None)
        self.log_cautious_qv_ratio()
class EpisodicPGSimulation(ModelLearningSimulation):
    def __init__(self, name, n_episodes, episode_max_steps, discount_rate,
                 step_size, features_function, n_features, initial_weight,
                 initial_var, shape):
        dynamics_parameters = {'shape': shape}
        self.env = LowGoalSlip(dynamics_parameters=dynamics_parameters)

        self.agent = PGOptimizer(env=self.env,
                                 discount_rate=discount_rate,
                                 step_size=step_size,
                                 features_function=features_function,
                                 n_features=n_features,
                                 initial_weight=initial_weight,
                                 initial_var=initial_var)

        self.ground_truth = SafetyTruth(self.env)
        self.ground_truth.from_vibly_file(
            Path(__file__).parent.parent.parent / 'data' / 'ground_truth' /
            'from_vibly' / 'slip_map.pickle')

        plotters = {'Samples': SamplePlotter(self.agent, self.ground_truth)}

        output_directory = Path(__file__).parent.resolve()
        super(EpisodicPGSimulation, self).__init__(output_directory, name,
                                                   plotters)

        self.n_episodes = n_episodes
        self.episode_max_steps = episode_max_steps

    def get_models_to_save(self):
        return {}  # TODO: so far, the models are not saved

    def load_models(self, skip_local=False):
        pass  # TODO

    def run_episode(self, n_episode):
        n_steps = 0
        episode = []
        while n_steps < self.episode_max_steps:
            n_steps += 1
            old_state = self.agent.state
            new_state, reward, failed = self.agent.step()
            action = self.agent.last_action
            step = {
                'state': old_state,
                'action': action,
                'new_state': new_state,
                'reward': reward,
                'failed': failed
            }
            episode.append(step)
            self.on_run_iteration(n_episode, n_steps, old_state, action,
                                  new_state, reward, failed)
            if failed:
                break
        return episode

    def run(self):
        n_episode = 0
        self.save_figs(prefix='Ep0')
        while n_episode < self.n_episodes:
            n_episode += 1
            self.agent.reset(np.array([0.4]))
            episode = self.run_episode(n_episode)
            self.agent.update_models(episode)
            self.save_figs(prefix=f'Ep{n_episode}')
            self.on_episode_iteration()
        print('Done.')

    def on_run_iteration(self, n_episode, n_steps, *args, **kwargs):
        super(EpisodicPGSimulation, self).on_run_iteration(*args, **kwargs)
        print(f'Episode {n_episode} - Step {n_steps}')
        print(self.agent.policy.actions_density)

    def on_episode_iteration(self):
        self.plotters['Samples'].flush_samples()
class HyperparametersSimulation(Simulation):
    def __init__(self, output_directory, name, max_samples, gamma_optimistic,
                 gamma_cautious, lambda_cautious, shape, ground_truth,
                 random_start=False, every=50):
        x_seed = np.array([1.45, 0.5])
        y_seed = np.array([.8])
        dynamics_parameters = {
            'shape': shape
        }
        self.env = Hovership(
            random_start=random_start,
            dynamics_parameters=dynamics_parameters,
            default_initial_state=x_seed[:1]
        )

        self.ground_truth = SafetyTruth(self.env)
        self.ground_truth.from_vibly_file(ground_truth)

        self.hyperparameters = {
            'outputscale_prior': (0.4, 2),
            'lengthscale_prior': (0.2, 0.2),
            'noise_prior': (0.001, 0.002)
        }
        self.agent = SafetyLearner(
            env=self.env,
            gamma_optimistic=gamma_optimistic,
            gamma_cautious=gamma_cautious,
            lambda_cautious=lambda_cautious,
            x_seed=x_seed,
            y_seed=y_seed,
            gp_params=self.hyperparameters,
        )
        self.agent.reset()

        plotters = {
            'Safety': SafetyPlotter(self.agent, self.ground_truth)
        }

        super(HyperparametersSimulation, self).__init__(
            output_directory, name, plotters
        )

        self.max_samples = max_samples
        self.every = every
        self.random_start = random_start

    def run(self):
        self.run_optim()
        self.run_learning()

    def run_optim(self):
        train_x, train_y = self.ground_truth.get_training_examples(
            n_examples=2000,
            from_viable=True,
            from_failure=False
        )
        self.agent.fit_models(train_x, train_y, epochs=20)

    def run_learning(self):
        gamma_optim_increment = (
            self.agent.gamma_cautious - self.agent.safety_model.gamma_measure
        ) / self.max_samples
        n_samples = 0
        self.save_figs(prefix='0')
        while n_samples < self.max_samples:
            failed = self.agent.failed
            n_steps = 0
            while not failed and n_steps < 50:
                n_samples += 1
                n_steps += 1
                old_state = self.agent.state
                new_state, reward, failed, _ = self.agent.step()
                action = self.agent.last_action
                self.on_run_iteration(
                    n_samples, old_state, action, new_state, reward, failed
                )
                if n_samples >= self.max_samples:
                    break
            if self.random_start:
                reset_state = np.atleast_1d(
                    np.random.choice(np.linspace(0, 1.5, 100))
                )
                self.agent.reset(reset_state)
            else:
                reset_state = self.agent.get_random_safe_state()
                if reset_state is None:
                    raise Exception('The whole measure is 0. There is no '
                                    'safe action.')
                self.agent.reset(reset_state)
            self.agent.safety_model.gamma_measure += gamma_optim_increment
        self.compile_gif()

    def on_run_iteration(self, n_samples, old_state, action, new_state,
                         reward, failed):
        super(HyperparametersSimulation, self).on_run_iteration(
            old_state, action, new_state, reward, failed
        )
        print(f'Step {n_samples}/{self.max_samples} - {old_state} '
              f'-> {action} -> {new_state} ({failed})')
        if n_samples % self.every == 0:
            self.save_figs(prefix=f'{n_samples}')
class PenalizedSimulation(ModelLearningSimulation):
    def __init__(self, name, max_samples, greed, step_size, discount_rate,
                 penalty_level, x_seed, y_seed, shape, every):
        dynamics_parameters = {'shape': shape}
        self.env = PenalizedHovership(
            penalty_level=penalty_level,
            dynamics_parameters=dynamics_parameters)

        self.ground_truth = SafetyTruth(self.env)
        self.ground_truth.from_vibly_file(
            Path(__file__).parent.parent.parent / 'data' / 'ground_truth' /
            'from_vibly' / 'hover_map.pickle')

        self.hyperparameters = {
            'outputscale_prior': (0.4, 2),
            'lengthscale_prior': (0.02, 0.02),
            'noise_prior': (0.001, 0.002)
        }
        self.x_seed = x_seed
        self.y_seed = y_seed
        self.agent = QLearner(self.env, greed, step_size, discount_rate,
                              x_seed=self.x_seed, y_seed=self.y_seed,
                              gp_params=self.hyperparameters)

        plotters = {'Q-Values': QValuePlotter(self.agent, self.ground_truth)}

        output_directory = Path(__file__).parent.resolve()
        super(PenalizedSimulation, self).__init__(output_directory, name,
                                                  plotters)

        self.max_samples = max_samples
        self.every = every

    def get_models_to_save(self):
        return {'q_values': self.agent.Q_model}

    def load_models(self, skip_local=False):
        model_name = list(self.get_models_to_save().keys())[0]
        if not skip_local:
            load_path = self.local_models_path / model_name
        else:
            load_path = self.models_path / model_name
        self.agent.value_model = GPQLearning.load(load_path,
                                                  self.env.stateaction_space,
                                                  self.x_seed, self.y_seed)

    def run(self):
        n_samples = 0
        self.save_figs(prefix='0')
        while n_samples < self.max_samples:
            failed = self.agent.failed
            n_steps = 0
            while not failed and n_steps < 50:
                n_samples += 1
                n_steps += 1
                old_state = self.agent.state
                new_state, reward, failed = self.agent.step()
                action = self.agent.last_action
                # if n_samples > 300:
                #     self.agent.greed *= (n_samples - 300) / (n_samples - 299)
                self.on_run_iteration(n_samples, old_state, action, new_state,
                                      reward, failed)
                if n_samples >= self.max_samples:
                    break
            self.agent.reset()

    def on_run_iteration(self, n_samples, *args, **kwargs):
        super(PenalizedSimulation, self).on_run_iteration(*args, **kwargs)
        print(f'Iteration {n_samples}/{self.max_samples}: {self.agent.greed}')
        if n_samples % self.every == 0:
            self.save_figs(prefix=f'{n_samples}')
class BenchmarkSingleSimulation(ModelLearningSimulation):
    EXP_REWARD_MNAME = 'expected_reward'
    EXP_FAILURE_MNAME = 'expected_failure'
    STD_REWARD_MNAME = 'std_reward'
    STD_FAILURE_MNAME = 'std_failure'
    Q_V_Q_C_MNAME = 'Q_V_minus_Q_cautious'
    Q_C_Q_V_MNAME = 'Q_cautious_minus_Q_V'
    METRICS_BASE_NAMES = [EXP_REWARD_MNAME, EXP_FAILURE_MNAME,
                          STD_REWARD_MNAME, STD_FAILURE_MNAME]

    def __init__(self, output_directory, name, envname, aname, envparams,
                 aparams, n_episodes, glie_start,
                 safety_parameters_update_end, reset_in_safe_state,
                 metrics_sampling_frequency, n_episodes_in_measurement,
                 plot_every, seed):
        self.env = ENV_CONSTRUCTOR[envname](**envparams)
        self.agent = AGENT_CONSTRUCTOR[aname](env=self.env, **aparams)
        safety_truth_path = SAFETY_TRUTH_PATH[envname]
        if envname in SAFETY_TRUTH_FROM_VIBLY:
            self.safety_truth = SafetyTruth(self.env)
            self.safety_truth.from_vibly_file(safety_truth_path)
        else:
            self.safety_truth = SafetyTruth.load(safety_truth_path, self.env)

        self.n_episodes = n_episodes
        self.glie_start = glie_start if not isinstance(glie_start, float) \
            else int(glie_start * self.n_episodes)
        if safety_parameters_update_end is not None:
            if isinstance(safety_parameters_update_end, float):
                update_end = int(safety_parameters_update_end * n_episodes)
                self.safety_parameters_update_end = update_end
            else:
                self.safety_parameters_update_end = \
                    safety_parameters_update_end
        else:
            self.safety_parameters_update_end = n_episodes
        self.reset_in_safe_state = reset_in_safe_state
        self.metrics_sampling_frequency = metrics_sampling_frequency
        self.n_episodes_in_measurement = n_episodes_in_measurement
        self.plot_every = plot_every
        self.agent_has_safety_model = aname in HAS_SAFETY_MODEL

        self.METRICS_NAMES = BenchmarkSingleSimulation.METRICS_BASE_NAMES
        if self.agent_has_safety_model:
            self.METRICS_NAMES += [
                BenchmarkSingleSimulation.Q_C_Q_V_MNAME,
                BenchmarkSingleSimulation.Q_V_Q_C_MNAME
            ]

        plotters = {}
        if envname in PLOTTABLE_Q:
            if self.agent_has_safety_model:
                plotters.update({
                    'Q-Values_Safety': QValueAndSafetyPlotter(
                        self.agent, self.safety_truth,
                        # ensure_in_dataset=True
                    )
                })
            else:
                plotters.update({
                    'Q-Values': QValuePlotter(
                        self.agent, self.safety_truth,
                        write_values=False, plot_samples=True,
                    )
                })

        super(BenchmarkSingleSimulation, self).__init__(output_directory,
                                                        name, plotters)
        self.set_seed(value=seed)

        self.metrics_path = self.output_directory / 'metrics'
        self.metrics = AgentMetrics(*self.METRICS_NAMES)

        simparams = {
            'output_directory': output_directory,
            'name': name,
            'n_episodes': n_episodes,
            'glie_start': glie_start,
            'safety_parameters_update_end': safety_parameters_update_end,
            'reset_in_safe_state': reset_in_safe_state,
            'metrics_sampling_frequency': metrics_sampling_frequency,
            'n_episodes_in_measurement': n_episodes_in_measurement,
            'plot_every': plot_every,
        }
        logger.info(config_msg(f"Setting up simulation {name}"))
        logger.info(config_msg(f"ENVIRONMENT: {envname}"))
        logger.info(config_msg(str(envparams)))
        logger.info(config_msg(f"AGENT: {aname}"))
        logger.info(config_msg(str(aparams)))
        logger.info(config_msg("SIMULATION:"))
        logger.info(config_msg(str(simparams)))

    def get_models_to_save(self):
        if self.agent_has_safety_model:
            return {
                'Q_model': self.agent.Q_model,
                'safety_model': self.agent.safety_model
            }
        else:
            return {
                'Q_model': self.agent.Q_model,
            }

    def load_models(self, skip_local=False):
        pass

    def get_random_safe_state(self):
        viable_state_indexes = np.argwhere(self.safety_truth.viability_kernel)
        chosen_index_among_safe = np.random.choice(
            viable_state_indexes.shape[0])
        chosen_index = tuple(viable_state_indexes[chosen_index_among_safe])
        safe_state = self.env.state_space[chosen_index]
        return safe_state

    def on_run_episode_iteration(self, *args, **kwargs):
        super(BenchmarkSingleSimulation, self).on_run_iteration(*args,
                                                                **kwargs)

    def on_run_iteration(self, n_ep):
        if n_ep % self.plot_every == 0:
            self.save_figs(prefix=f'{n_ep}')

    def run_episode(self):
        episode = []
        reset_state = None if not self.reset_in_safe_state else \
            self.get_random_safe_state()
        # We don't allow initializing in failure directly, even when
        # reset_in_safe_state == False
        done = True
        while done:
            self.agent.reset(reset_state)
            done = self.env.done
        while not done:
            old_state = self.agent.state
            new_state, reward, failed = self.agent.step()
            done = self.env.done
            action = self.agent.last_action
            episode.append(
                (old_state, action, new_state, reward, failed, done))
            if self.agent.training_mode:
                if self.agent_has_safety_model:
                    color = None if not self.agent.updated_safety else \
                        FAILURE_SAMPLE_COLOR
                else:
                    color = None
                self.on_run_episode_iteration(
                    state=old_state, action=action, new_state=new_state,
                    reward=reward, failed=failed, done=done, color=color,
                )
        return episode

    def run(self):
        self.save_figs(prefix='init')
        training_episodes = [None] * self.n_episodes
        for n_ep in range(self.n_episodes):
            self.agent.training_mode = True
            episode = self.run_episode()
            training_episodes[n_ep] = episode
            try:
                total_reward = sum(list(zip(*episode))[3])
                failed = 'failed' if episode[-1][4] else 'success'
            except IndexError:
                total_reward = 0
                failed = 'failed'
            logging.info(f'Episode {n_ep}: {total_reward} reward | {failed}')
            msg = '\n'.join([str(epstep) for epstep in episode])
            logging.info(msg)
            if (n_ep >= 0) and (n_ep % self.metrics_sampling_frequency == 0):
                self.agent.training_mode = False
                measurement_episodes = \
                    [None] * self.n_episodes_in_measurement
                for n_measurement_ep in range(self.n_episodes_in_measurement):
                    measurement_episodes[n_measurement_ep] = \
                        self.run_episode()
                self.save_episodes(measurement_episodes, f'meas_{n_ep}')
                metrics_list = self.get_metrics(measurement_episodes)
                self.metrics.add_measurement(n_ep, *metrics_list)
            self.on_run_iteration(n_ep)
            if n_ep >= self.glie_start:
                self.agent.decrease_step_size()
            if self.agent_has_safety_model and \
                    (n_ep <= self.safety_parameters_update_end):
                t = (n_ep + 1) / self.safety_parameters_update_end
                self.agent.safety_parameters_affine_update(t)
            if isinstance(self.agent, SafetyQLearningSwitcher) and \
                    (n_ep == self.safety_parameters_update_end):
                self.agent.explore_safety = False
        self.save_episodes(training_episodes, 'training')
        self.metrics.save(self.metrics_path)

    def get_metrics(self, measurement_episodes):
        # measurement_episodes = np.array(measurement_episodes, dtype=float)
        episodes_lists = [list(zip(*ep)) for ep in measurement_episodes]
        rewards = [sum(ep_list[3]) for ep_list in episodes_lists]
        failures = [any(ep_list[4]) for ep_list in episodes_lists]
        # Metrics from measurement episodes
        exp_reward_metric = np.mean(rewards)
        std_reward_metric = np.std(rewards)
        exp_failure_metric = np.mean(failures)
        std_failure_metric = np.std(failures)
        metrics_values = [
            exp_reward_metric,
            exp_failure_metric,
            std_reward_metric,
            std_failure_metric,
        ]
        # Metrics that don't require measurement episodes
        if self.agent_has_safety_model:
            Q_cautious = self.agent.safety_model.level_set(
                state=None,  # Whole state-space
                lambda_threshold=self.agent.lambda_cautious,
                gamma_threshold=self.agent.gamma_cautious).astype(int)
            Q_V = self.safety_truth.viable_set_like(
                self.env.stateaction_space).astype(int)
            Q_cautious_Q_V = (Q_cautious - Q_V).clip(0, 1)
            Q_V_Q_cautious = (Q_V - Q_cautious).clip(0, 1)
            # The measure of the underlying sets is the mean value of each of
            # these arrays
            Q_cautious_Q_V_metric = Q_cautious_Q_V.sum() / Q_V.sum()
            Q_V_Q_cautious_metric = Q_V_Q_cautious.sum() / Q_V.sum()
            metrics_values += [
                Q_cautious_Q_V_metric,
                Q_V_Q_cautious_metric,
            ]
        return list(zip(self.METRICS_NAMES, metrics_values))

    def save_episodes(self, episodes, name):
        def remove_np_arrays(e):
            return (e[0][0], e[1][0], e[2][0], e[3], e[4], e[5])
        episodes = [list(map(remove_np_arrays, ep)) for ep in episodes]
        episodes = [list(zip(*ep)) for ep in episodes]
        episodes = [{
            'states': ep[0],
            'actions': ep[1],
            'next_states': ep[2],
            'rewards': ep[3],
            'failed': ep[4],
            'done': ep[5],
        } for ep in episodes]
        keys = episodes[0].keys()
        flattened_dict = {
            f'{key}_EPISODE_{n}': episodes[n][key]
            for key in keys for n in range(len(episodes))
        }
        save_path = self.samples_path / name
        np.savez(save_path, **flattened_dict)
class SoftHardSimulation(ModelLearningSimulation):
    def __init__(self, name, env_name, reward_threshold, control_frequency,
                 max_samples, max_steps, greed, step_size, discount_rate,
                 gamma_optimistic, gamma_hard, lambda_hard, gamma_soft,
                 q_x_seed, q_y_seed, s_x_seed, s_y_seed,
                 optimize_hyperparameters, dataset_type, dataset_params,
                 shape, every, glie_start, reset_in_safe_state,
                 plotter_smoothing_window_size):
        parameterization = {
            'env_name': env_name,
            'reward_threshold': reward_threshold,
            'control_frequency': control_frequency,
            'max_samples': max_samples,
            'greed': greed,
            'step_size': step_size,
            'discount_rate': discount_rate,
            'gamma_optimistic': gamma_optimistic,
            'gamma_hard': gamma_hard,
            'lambda_hard': lambda_hard,
            'gamma_soft': gamma_soft,
            'q_x_seed': q_x_seed,
            'q_y_seed': q_y_seed,
            's_x_seed': s_x_seed,
            's_y_seed': s_y_seed,
            'optimize_hyperparameters': optimize_hyperparameters,
            'dataset_type': dataset_type,
            'dataset_params': dataset_params,
            'shape': shape,
            'every': every,
            'glie_start': glie_start,
            'reset_in_safe_state': reset_in_safe_state,
            'plotter_smoothing_window_size': plotter_smoothing_window_size
        }
        dynamics_parameters = {'shape': shape}
        if env_name == 'slip':
            self.env = LowGoalSlip(dynamics_parameters=dynamics_parameters,
                                   reward_done_threshold=reward_threshold)
        elif env_name == 'hovership':
            self.env = LowGoalHovership(
                dynamics_parameters=dynamics_parameters,
                reward_done_threshold=reward_threshold)
        elif env_name == 'cartpole':
            self.env = CartPole(discretization_shape=shape,
                                control_frequency=control_frequency)
        elif env_name == 'lander':
            self.env = LunarLander(discretization_shape=shape)

        self.q_hyperparameters = {
            'outputscale_prior': (0.12, 0.01),
            'lengthscale_prior': (0.15, 0.05),
            'noise_prior': (0.001, 0.002),
            'dataset_type': dataset_type,
            'dataset_params': dataset_params,
        }
        self.s_hyperparameters = {
            'outputscale_prior': (0.12, 0.01),
            'lengthscale_prior': (0.15, 0.05),
            'noise_prior': (0.001, 0.002),
            'dataset_type': dataset_type,
            'dataset_params': dataset_params,
        }
        self.q_x_seed = q_x_seed
        self.q_y_seed = q_y_seed
        self.s_x_seed = s_x_seed
        self.s_y_seed = s_y_seed
        self.optimize_hyperparameters = optimize_hyperparameters

        self.gamma_optimistic_start, self.gamma_optimistic_end = \
            identity_or_duplicated_value(gamma_optimistic)
        self.gamma_hard_start, self.gamma_hard_end = \
            identity_or_duplicated_value(gamma_hard)
        self.lambda_hard_start, self.lambda_hard_end = \
            identity_or_duplicated_value(lambda_hard)
        self.gamma_soft_start, self.gamma_soft_end = \
            identity_or_duplicated_value(gamma_soft)
        self.gamma_optimistic = self.gamma_optimistic_start
        self.gamma_hard = self.gamma_hard_start
        self.gamma_soft = self.gamma_soft_start
        self.lambda_hard = self.lambda_hard_start

        self.agent = SoftHardLearner(
            self.env,
            greed=greed,
            step_size=step_size,
            discount_rate=discount_rate,
            q_x_seed=self.q_x_seed,
            q_y_seed=self.q_y_seed,
            gamma_optimistic=self.gamma_optimistic,
            gamma_hard=self.gamma_hard,
            lambda_hard=self.lambda_hard,
            gamma_soft=self.gamma_soft,
            s_x_seed=s_x_seed,
            s_y_seed=s_y_seed,
            q_gp_params=self.q_hyperparameters,
            s_gp_params=self.s_hyperparameters,
        )

        if env_name == 'slip':
            truth_path = Path(__file__).parent.parent.parent / 'data' / \
                'ground_truth' / 'from_vibly' / 'slip_map.pickle'
        elif env_name == 'hovership':
            truth_path = Path(__file__).parent.parent.parent / 'data' / \
                'ground_truth' / 'from_vibly' / 'hover_map.pickle'
        else:
            truth_path = None
        if truth_path is not None:
            self.ground_truth = SafetyTruth(self.env)
            self.ground_truth.from_vibly_file(truth_path)
        else:
            self.ground_truth = None

        plottable_Q = ['slip', 'hovership']
        if env_name in plottable_Q:
            plotters = {
                'Q-Values_Safety': SoftHardPlotter(self.agent,
                                                   self.ground_truth,
                                                   ensure_in_dataset=True)
            }
        else:
            plotters = {}
        plotters.update({
            'RewardFailure': RewardFailurePlotter(
                agents_names=['Soft-hard'],
                window_size=plotter_smoothing_window_size,
                padding_value=1)
        })

        output_directory = Path(__file__).parent.resolve()
        super(SoftHardSimulation, self).__init__(output_directory, name,
                                                 plotters)

        self.max_samples = max_samples
        self.max_steps = max_steps
        self.every = every
        if isinstance(glie_start, float):
            self.glie_start = int(glie_start * self.max_samples)
        else:
            self.glie_start = glie_start
        self.reset_in_safe_state = reset_in_safe_state

        msg = ''
        for pname, pval in parameterization.items():
            msg += pname + ' = ' + str(pval) + ', '
        msg = msg[:-2]
        logging.info(config_msg(f'Simulation started with parameters: {msg}'))

    def get_models_to_save(self):
        # The keys must be the same as the actual names of the attributes:
        # this is used in load_models. This is hacky and should be replaced.
        return {
            'Q_model': self.agent.Q_model,
            'safety_model': self.agent.safety_model
        }

    def load_models(self, skip_local=False):
        from edge.model.safety_models import MaternSafety
        from edge.model.value_models import GPQLearning
        models_names = list(self.get_models_to_save().keys())
        loaders = {
            'Q_model': lambda mpath: GPQLearning(mpath, self.env,
                                                 self.q_x_seed,
                                                 self.q_y_seed),
            'safety_model': lambda mpath: MaternSafety(mpath, self.env,
                                                       self.gamma_optimistic,
                                                       self.s_x_seed,
                                                       self.s_y_seed),
        }
        for mname in models_names:
            if not skip_local:
                load_path = self.local_models_path / mname
            else:
                load_path = self.models_path / mname
            setattr(self.agent, mname, loaders[mname](load_path))

    def run(self):
        n_samples = 0
        self.save_figs(prefix='0')

        if self.optimize_hyperparameters:
            logging.info('Optimizing hyperparameters...')
            s_train_x, s_train_y = self.ground_truth.get_training_examples()
            self.agent.fit_models(s_epochs=50, s_train_x=s_train_x,
                                  s_train_y=s_train_y,
                                  s_optimizer_kwargs={'lr': 0.1})
            self.agent.fit_models(s_epochs=50, s_train_x=s_train_x,
                                  s_train_y=s_train_y,
                                  s_optimizer_kwargs={'lr': 0.01})
            self.agent.fit_models(s_epochs=50, s_train_x=s_train_x,
                                  s_train_y=s_train_y,
                                  s_optimizer_kwargs={'lr': 0.001})
            logging.info('Done.')
        else:
            logging.info('Hyperparameters were NOT optimized.')
        logging.info(config_msg(
            'Lengthscale:'
            f'{self.agent.safety_model.gp.covar_module.base_kernel.lengthscale}'
        ))
        logging.info(config_msg(
            'Outputscale:'
            f'{self.agent.safety_model.gp.covar_module.outputscale}'))

        logging.info('Training...')
        while n_samples < self.max_samples:
            if self.reset_in_safe_state:
                reset_state = self.agent.get_random_safe_state()
            else:
                reset_state = None
            self.agent.reset(reset_state)
            failed = self.agent.failed
            done = self.env.done
            n_steps = 0
            while not done and n_steps < self.max_steps:
                n_samples += 1
                n_steps += 1
                old_state = self.agent.state
                new_state, reward, failed, done = self.agent.step()
                action = self.agent.last_action
                # * start reducing step size so Q-Learning converges
                if self.glie_start is not None and \
                        n_samples > self.glie_start:
                    self.agent.step_size *= (n_samples - self.glie_start) / (
                        n_samples - self.glie_start + 1)
                self.agent.gamma_optimistic = affine_interpolation(
                    n_samples / self.max_samples,
                    self.gamma_optimistic_start, self.gamma_optimistic_end)
                self.agent.gamma_hard = affine_interpolation(
                    n_samples / self.max_samples,
                    self.gamma_hard_start, self.gamma_hard_end)
                self.agent.lambda_hard = affine_interpolation(
                    n_samples / self.max_samples,
                    self.lambda_hard_start, self.lambda_hard_end)
                self.agent.gamma_soft = affine_interpolation(
                    n_samples / self.max_samples,
                    self.gamma_soft_start, self.gamma_soft_end)
                color = None if not self.agent.updated_safety \
                    else [0.3, 0.3, 0.9]
                self.on_run_iteration(n_samples=n_samples, state=old_state,
                                      action=action, new_state=new_state,
                                      reward=reward, failed=failed, done=done,
                                      color=color, aname='Soft-hard')
                if n_samples >= self.max_samples:
                    break
        logging.info('Done.')

        self.save_figs(prefix=f'{self.name}_final')
        self.compile_gif()

    def on_run_iteration(self, n_samples, *args, **kwargs):
        super(SoftHardSimulation, self).on_run_iteration(*args, **kwargs)
        logging.info(f'Iteration {n_samples}/{self.max_samples}')
        logging.info(f'# of Q-values training examples: '
                     f'{len(self.agent.Q_model.gp.train_x)}')
        logging.info(f'# of safety measure training examples: '
                     f'{len(self.agent.safety_model.gp.train_x)}')
        if kwargs['failed']:
            logging.info('Failed!')
        elif kwargs['done']:
            logging.info('Solved!')
        if n_samples % self.every == 0:
            self.save_figs(prefix=f'{n_samples}')
            self.env.render()
class OptimisticSimulation(Simulation):
    def __init__(self, max_samples, gamma_optimistic, gamma_cautious,
                 lambda_cautious, shape, every):
        self.x_seed = np.array([1.45, 0.5])
        self.y_seed = np.array([.8])
        dynamics_parameters = {'shape': shape}
        self.env = Hovership(
            random_start=True,
            dynamics_parameters=dynamics_parameters,
            default_initial_state=self.x_seed[:1]
        )
        self.ground_truth = SafetyTruth(self.env)
        self.ground_truth.from_vibly_file(
            '../data/ground_truth/from_vibly/hover_map.pickle'
        )
        self.hyperparameters = {
            'outputscale_prior': (0.4, 2),
            'lengthscale_prior': (0.1, 0.1),
            'noise_prior': (0.001, 0.002)
        }
        self.agent = SafetyLearner(
            env=self.env,
            gamma_optimistic=gamma_optimistic,
            gamma_cautious=gamma_cautious,
            lambda_cautious=lambda_cautious,
            x_seed=self.x_seed,
            y_seed=self.y_seed,
            gp_params=self.hyperparameters,
        )
        plotters = {
            'DetailedSafety': DetailedSafetyPlotter(self.agent,
                                                    self.ground_truth)
        }
        super(OptimisticSimulation, self).__init__(
            'results', 'optimistic', plotters
        )
        self.max_samples = max_samples
        self.every = every
        self.samples_path = self.output_directory / 'samples'
        self.samples_path.mkdir(parents=True, exist_ok=True)
        self.model_path = self.output_directory / 'model'
        self.model_path.mkdir(parents=True, exist_ok=True)
        # Keep every third state-action pair of the ground-truth failure set,
        # used as probe points by check_failure_set.
        failure_indexes = np.argwhere(self.ground_truth.failure_set == 1)
        self.failure_set = np.array([
            self.ground_truth.stateaction_space[tuple(index)]
            for index in failure_indexes[::3]
        ])

    def run_optim(self):
        # Fit the safety GP on examples sampled from the viable set of the
        # ground truth only.
        train_x, train_y = self.ground_truth.get_training_examples(
            n_examples=2000, from_viable=True, from_failure=False
        )
        self.agent.fit_models(train_x, train_y, epochs=20)

    def save_samples(self, name):
        self.agent.safety_model.save_samples(str(self.samples_path / name))

    def load_samples(self, name):
        self.agent.safety_model.load_samples(str(self.samples_path / name))

    def save_model(self):
        self.agent.safety_model.save(str(self.model_path))

    def load_model(self):
        self.agent.safety_model = MaternSafety.load(
            str(self.model_path),
            self.env,
            self.agent.safety_model.gamma_measure,
            self.x_seed,
            self.y_seed
        )

    def check_failure_set(self):
        # Sanity check: warn if the safety model assigns an above-threshold
        # level to state-action pairs of the known failure set.
        model = self.agent.safety_model
        measure_slice, covar_slice = model._query(
            self.failure_set, return_covar=True)
        level_value = norm.cdf(
            (measure_slice - 0) / np.sqrt(covar_slice)
        )
        failure_levels = level_value > model.gamma_measure
        if failure_levels.any():
            print('Nonzero value in the failure set!')

    def run_learning(self):
        n_samples = 0
        self.save_figs(prefix='0')
        while n_samples < self.max_samples:
            failed = self.agent.failed
            n_steps = 0
            while not failed and n_steps < 50:
                # self.check_failure_set()
                n_samples += 1
                n_steps += 1
                old_state = self.agent.state
                new_state, reward, failed, _ = self.agent.step()
                action = self.agent.last_action
                self.on_run_iteration(
                    n_samples, old_state, action, new_state, reward, failed
                )
                if n_samples >= self.max_samples:
                    break
            reset_state = self.agent.get_random_safe_state()
            if reset_state is None:
                raise Exception('The whole measure is 0. There is no safe '
                                'action.')
            self.agent.reset(reset_state)

    def on_run_iteration(self, n_samples, old_state, action, new_state,
                         reward, failed):
        super(OptimisticSimulation, self).on_run_iteration(
            old_state, action, new_state, reward, failed
        )
        print(f'Step {n_samples}/{self.max_samples} - {old_state} '
              f'-> {action} -> {new_state} ({failed})')
        if n_samples % self.every == 0:
            self.save_figs(prefix=f'{n_samples}')
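# --- Illustration (not part of the class above) ---
# A minimal usage sketch for OptimisticSimulation. All parameter values are
# illustrative, and the sketch assumes the working directory is such that the
# relative ground-truth path hard-coded in __init__
# ('../data/ground_truth/from_vibly/hover_map.pickle') resolves.

if __name__ == '__main__':
    sim = OptimisticSimulation(
        max_samples=500,        # illustrative
        gamma_optimistic=0.9,   # illustrative
        gamma_cautious=0.9,     # illustrative
        lambda_cautious=0.1,    # illustrative
        shape=(50, 50),         # illustrative discretization
        every=50,               # save figures every 50 samples
    )
    sim.run_optim()        # fit the safety GP on ground-truth examples
    sim.save_model()       # checkpoint the fitted safety model
    sim.run_learning()     # then learn from interaction
    sim.save_samples('final')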
rc('text', usetex=True)
plt.rc('text', usetex=True)
plt.rc('font', family='serif')

VIBLY_DATA_PATH = Path('../../data/ground_truth/from_vibly')
hover_path = VIBLY_DATA_PATH / 'hover_map.pickle'
slip_path = VIBLY_DATA_PATH / 'slip_map.pickle'

output_path = Path('.') / 'state_action_spaces'
output_path.mkdir(exist_ok=True)

for envname, envconstr, param, tpath in [
        ('hovership', LowGoalHovership, LOW_GOAL_HOVERSHIP_PARAMS, hover_path),
        ('slip', LowGoalSlip, LOW_GOAL_SLIP_PARAMS, slip_path)]:
    env = envconstr(**param)
    truth = SafetyTruth(env)
    truth.from_vibly_file(tpath)

    subplotter = SafetyTruthSubplotter(truth, corl_colors)
    figure = plt.figure(constrained_layout=True, figsize=(5.5, 4.8))
    # gs = figure.add_gridspec(1, 2, width_ratios=[3, 1])
    ax_Q = figure.add_subplot()

    subplotter.draw_on_axs(ax_Q, None)
    ax_Q.tick_params(direction='in', top=True, right=True)
    # ax_S.tick_params(direction='in', left=False)
    ax_Q.set_xlabel(r'action space $A$')
    ax_Q.set_ylabel(r'state space $S$')
    # ax_S.set_xlabel(r'$\Lambda$')
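# --- Illustration (not part of the script above) ---
# The loop above builds one ground-truth state-action space figure per
# environment, but the excerpt stops before any output is written. A
# plausible final statement for the loop body, assuming the figures should
# simply be saved into the `state_action_spaces` directory created above,
# would be:
#
#     figure.savefig(str(output_path / f'{envname}.pdf'), format='pdf')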
def __init__(self, name, env_name, reward_threshold, control_frequency,
             max_samples, max_steps, greed, step_size, discount_rate,
             gamma_optimistic, gamma_hard, lambda_hard, gamma_soft,
             q_x_seed, q_y_seed, s_x_seed, s_y_seed,
             optimize_hyperparameters, dataset_type, dataset_params,
             shape, every, glie_start, reset_in_safe_state,
             plotter_smoothing_window_size):
    parameterization = {
        'env_name': env_name,
        'reward_threshold': reward_threshold,
        'control_frequency': control_frequency,
        'max_samples': max_samples,
        'greed': greed,
        'step_size': step_size,
        'discount_rate': discount_rate,
        'gamma_optimistic': gamma_optimistic,
        'gamma_hard': gamma_hard,
        'lambda_hard': lambda_hard,
        'gamma_soft': gamma_soft,
        'q_x_seed': q_x_seed,
        'q_y_seed': q_y_seed,
        's_x_seed': s_x_seed,
        's_y_seed': s_y_seed,
        'optimize_hyperparameters': optimize_hyperparameters,
        'dataset_type': dataset_type,
        'dataset_params': dataset_params,
        'shape': shape,
        'every': every,
        'glie_start': glie_start,
        'reset_in_safe_state': reset_in_safe_state,
        'plotter_smoothing_window_size': plotter_smoothing_window_size
    }
    dynamics_parameters = {'shape': shape}
    if env_name == 'slip':
        self.env = LowGoalSlip(dynamics_parameters=dynamics_parameters,
                               reward_done_threshold=reward_threshold)
    elif env_name == 'hovership':
        self.env = LowGoalHovership(
            dynamics_parameters=dynamics_parameters,
            reward_done_threshold=reward_threshold)
    elif env_name == 'cartpole':
        self.env = CartPole(discretization_shape=shape,
                            control_frequency=control_frequency)
    elif env_name == 'lander':
        self.env = LunarLander(discretization_shape=shape)

    self.q_hyperparameters = {
        'outputscale_prior': (0.12, 0.01),
        'lengthscale_prior': (0.15, 0.05),
        'noise_prior': (0.001, 0.002),
        'dataset_type': dataset_type,
        'dataset_params': dataset_params,
    }
    self.s_hyperparameters = {
        'outputscale_prior': (0.12, 0.01),
        'lengthscale_prior': (0.15, 0.05),
        'noise_prior': (0.001, 0.002),
        'dataset_type': dataset_type,
        'dataset_params': dataset_params,
    }
    self.q_x_seed = q_x_seed
    self.q_y_seed = q_y_seed
    self.s_x_seed = s_x_seed
    self.s_y_seed = s_y_seed
    self.optimize_hyperparameters = optimize_hyperparameters

    self.gamma_optimistic_start, self.gamma_optimistic_end = \
        identity_or_duplicated_value(gamma_optimistic)
    self.gamma_hard_start, self.gamma_hard_end = \
        identity_or_duplicated_value(gamma_hard)
    self.lambda_hard_start, self.lambda_hard_end = \
        identity_or_duplicated_value(lambda_hard)
    self.gamma_soft_start, self.gamma_soft_end = \
        identity_or_duplicated_value(gamma_soft)
    self.gamma_optimistic = self.gamma_optimistic_start
    self.gamma_hard = self.gamma_hard_start
    self.gamma_soft = self.gamma_soft_start
    self.lambda_hard = self.lambda_hard_start

    self.agent = SoftHardLearner(
        self.env,
        greed=greed,
        step_size=step_size,
        discount_rate=discount_rate,
        q_x_seed=self.q_x_seed,
        q_y_seed=self.q_y_seed,
        gamma_optimistic=self.gamma_optimistic,
        gamma_hard=self.gamma_hard,
        lambda_hard=self.lambda_hard,
        gamma_soft=self.gamma_soft,
        s_x_seed=s_x_seed,
        s_y_seed=s_y_seed,
        q_gp_params=self.q_hyperparameters,
        s_gp_params=self.s_hyperparameters,
    )

    if env_name == 'slip':
        truth_path = Path(__file__).parent.parent.parent / 'data' / \
            'ground_truth' / 'from_vibly' / 'slip_map.pickle'
    elif env_name == 'hovership':
        truth_path = Path(__file__).parent.parent.parent / 'data' / \
            'ground_truth' / 'from_vibly' / 'hover_map.pickle'
    else:
        truth_path = None
    if truth_path is not None:
        self.ground_truth = SafetyTruth(self.env)
        self.ground_truth.from_vibly_file(truth_path)
    else:
        self.ground_truth = None

    plottable_Q = ['slip', 'hovership']
    if env_name in plottable_Q:
        plotters = {
            'Q-Values_Safety': SoftHardPlotter(self.agent, self.ground_truth,
                                               ensure_in_dataset=True)
        }
    else:
        plotters = {}
    plotters.update({
        'RewardFailure': RewardFailurePlotter(
            agents_names=['Soft-hard'],
            window_size=plotter_smoothing_window_size,
            padding_value=1)
    })
    output_directory = Path(__file__).parent.resolve()
    super(SoftHardSimulation, self).__init__(output_directory, name, plotters)

    self.max_samples = max_samples
    self.max_steps = max_steps
    self.every = every
    if isinstance(glie_start, float):
        self.glie_start = int(glie_start * self.max_samples)
    else:
        self.glie_start = glie_start
    self.reset_in_safe_state = reset_in_safe_state

    msg = ''
    for pname, pval in parameterization.items():
        msg += pname + ' = ' + str(pval) + ', '
    msg = msg[:-2]
    logging.info(config_msg(f'Simulation started with parameters: {msg}'))
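# --- Illustration (not part of the __init__ above) ---
# A minimal sketch of how SoftHardSimulation might be instantiated on the
# 'hovership' environment. All values are illustrative. The schedule
# parameters (gamma_optimistic, gamma_hard, lambda_hard, gamma_soft)
# apparently accept either a single value or a (start, end) pair, since
# identity_or_duplicated_value splits them and run() interpolates between
# start and end over the course of training.

if __name__ == '__main__':
    import numpy as np

    sim = SoftHardSimulation(
        name='softhard_hovership_demo',     # illustrative
        env_name='hovership',
        reward_threshold=None,              # illustrative
        control_frequency=None,             # only used by the cartpole env
        max_samples=500,
        max_steps=50,
        greed=0.1,
        step_size=0.6,
        discount_rate=0.9,
        gamma_optimistic=(0.6, 0.9),        # annealed over training
        gamma_hard=(0.6, 0.9),
        lambda_hard=(0.0, 0.05),
        gamma_soft=(0.7, 0.95),
        q_x_seed=np.array([1.45, 0.5]),     # illustrative seed data
        q_y_seed=np.array([1.]),
        s_x_seed=np.array([1.45, 0.5]),
        s_y_seed=np.array([1.]),
        optimize_hyperparameters=False,
        dataset_type=None,                  # illustrative
        dataset_params=None,                # illustrative
        shape=(50, 50),                     # illustrative discretization
        every=50,
        glie_start=0.9,                     # fraction of max_samples
        reset_in_safe_state=True,
        plotter_smoothing_window_size=10,
    )
    sim.run()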