def get_nn(self):
    # Load the PPO checkpoint and convert the policy to a plain torch Sequential on CPU.
    config = get_PPO_config(1234, 0)
    trainer = ppo.PPOTrainer(config=config)
    trainer.restore(self.nn_path)
    policy = trainer.get_policy()
    sequential_nn = convert_ray_policy_to_sequential(policy).cpu()
    nn = sequential_nn
    return nn
def get_nn(self):
    from training.ppo.tune.tune_train_PPO_inverted_pendulum import get_PPO_config
    config = get_PPO_config(1234, 0)
    trainer = ppo.PPOTrainer(config=config)
    trainer.restore(self.nn_path)
    policy = trainer.get_policy()
    sequential_nn = convert_ray_policy_to_sequential(policy).cpu()
    nn = sequential_nn
    return nn
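# Hedged usage sketch, not part of the original source: how a network returned by one
# of the get_nn() variants above is typically queried. The two-layer Sequential here
# is a stand-in with the same call interface as the converted policy; the input
# ordering (delta_x, delta_v) is an assumption borrowed from the dataset code below.
def _example_query_policy():
    import torch
    policy_net = torch.nn.Sequential(torch.nn.Linear(2, 32), torch.nn.ReLU(), torch.nn.Linear(32, 2))  # stand-in
    state = torch.tensor([[20.0, 5.0]])  # batch of one reduced observation
    with torch.no_grad():
        action = torch.argmax(policy_net(state), dim=1).item()  # greedy discrete action
    return action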
def __init__(self, path, size=(500, 100), seed=1234, traces=True):
    """Build a labelled transition dataset for the StoppingCar environment.

    Labels: 1 = transition sampled from a start state, -1 = transition on a
    trace that reached an unsafe state, 0 = transition on a safe trace.
    """
    self.size = size
    config = get_PPO_config(1234, use_gpu=0)
    trainer = ppo.PPOTrainer(config=config)
    trainer.restore(path)
    policy = trainer.get_policy()
    sequential_nn = convert_ray_policy_to_sequential(policy).cpu()
    config = {"cost_fn": 1, "simplified": True}
    self.env = StoppingCar(config)
    self.env.seed(seed)
    load_dataset = True
    file_name = "dataset_new.p"
    if load_dataset and traces and os.path.exists(file_name):
        with open(file_name, "rb") as f:
            dataset = pickle.load(f)
    else:
        dataset = []
        while len(dataset) < size[0]:
            state_np = self.env.reset()  # only starting states
            state_reduced = torch.from_numpy(state_np).float().unsqueeze(0)[:, -2:]  # pick just delta_x and delta_v
            action = torch.argmax(sequential_nn(state_reduced)).item()
            next_state_np, reward, done, _ = self.env.step(action)
            dataset.append((state_np.astype(dtype=np.float32), next_state_np.astype(dtype=np.float32), 1))
        param_grid = {'delta_v': np.arange(-30, 30, 0.5), 'delta_x': np.arange(-10, 40, 0.5)}
        for parameters in ParameterGrid(param_grid):
            delta_v = parameters["delta_v"]
            delta_x = parameters["delta_x"]
            self.env.reset()
            self.env.x_lead = delta_x
            self.env.x_ego = 0
            self.env.v_lead = delta_v
            self.env.v_ego = 0
            done = False
            temp_dataset = []
            state_np = np.array([delta_v, delta_x])
            state_reduced = torch.from_numpy(state_np).float().unsqueeze(0)[:, -2:]  # pick just delta_x and delta_v
            for i in (range(100) if traces else range(1)):
                # action = torch.argmax(sequential_nn(state_reduced)).item()
                action = self.env.perfect_action()
                next_state_np, reward, done, _ = self.env.step(action)
                temp_dataset.append((state_np, next_state_np))
                state_np = next_state_np
                if next_state_np[1] < 0.5 and not done:
                    done = True
                if done is True:  # only unsafe states
                    break
            if done:
                for state_np, next_state_np in temp_dataset:
                    dataset.append((state_np.astype(dtype=np.float32), next_state_np.astype(dtype=np.float32), -1))
            else:
                for state_np, next_state_np in temp_dataset:
                    dataset.append((state_np.astype(dtype=np.float32), next_state_np.astype(dtype=np.float32), 0))
        if traces:
            with open(file_name, "wb+") as f:
                pickle.dump(dataset, f)
    self.dataset = dataset
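# Hedged sketch, an assumption rather than code from the original source: one way the
# labelled tuples built above can be consumed downstream. The label convention is
# taken from the constructor itself: 1 = start-state transition, -1 = transition on
# an unsafe trace, 0 = transition on a safe trace.
def _split_dataset_by_label(dataset):
    starts = [(s, ns) for s, ns, label in dataset if label == 1]
    unsafe = [(s, ns) for s, ns, label in dataset if label == -1]
    safe = [(s, ns) for s, ns, label in dataset if label == 0]
    return starts, unsafe, safe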
def get_nn(self):
    config = get_PPO_config(1234)
    trainer = ppo.PPOTrainer(config=config)
    trainer.restore(self.nn_path)
    policy = trainer.get_policy()
    sequential_nn = convert_ray_policy_to_sequential(policy).cpu()
    layers = []
    for l in sequential_nn:
        layers.append(l)
    nn = torch.nn.Sequential(*layers)
    return nn
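# Hedged note, not from the original file: rebuilding a Sequential from its iterated
# layers, as get_nn() does above, creates a new container but reuses the same module
# instances, so parameters are shared rather than copied. A minimal check:
def _example_rebuild_shares_weights():
    import torch
    original = torch.nn.Sequential(torch.nn.Linear(4, 8), torch.nn.ReLU(), torch.nn.Linear(8, 2))
    rebuilt = torch.nn.Sequential(*[layer for layer in original])
    assert rebuilt[0] is original[0]  # same Linear instance, same weight tensor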
def get_nn_old(self):
    config, trainer = get_PPO_trainer(use_gpu=0)
    trainer.restore("/home/edoardo/ray_results/PPO_BouncingBall_2021-01-04_18-58-32smp2ln1g/checkpoint_272/checkpoint-272")
    policy = trainer.get_policy()
    sequential_nn = convert_ray_policy_to_sequential(policy).cpu()
    layers = []
    for l in sequential_nn:
        layers.append(l)
    nn = torch.nn.Sequential(*layers)
    return nn
def get_nn_old(self):
    config, trainer = get_PPO_trainer(use_gpu=0)
    trainer.restore("/home/edoardo/ray_results/PPO_StoppingCar_2020-12-30_17-06-3265yz3d63/checkpoint_65/checkpoint-65")
    policy = trainer.get_policy()
    sequential_nn = convert_ray_policy_to_sequential(policy).cpu()
    # l0 = torch.nn.Linear(6, 2, bias=False)
    # l0.weight = torch.nn.Parameter(torch.tensor([[0, 0, 1, -1, 0, 0], [1, -1, 0, 0, 0, 0]], dtype=torch.float32))
    # layers = [l0]
    # for l in sequential_nn:
    #     layers.append(l)
    # nn = torch.nn.Sequential(*layers)
    nn = sequential_nn
    # ray.shutdown()
    return nn
def get_nn(self):
    # Cache the converted network next to the checkpoint so later loads skip the Ray round-trip.
    pickled_path = self.nn_path + ".pickle"
    if os.path.exists(pickled_path):
        nn = torch.load(pickled_path, map_location=torch.device('cpu'))
        return nn
    config = get_PPO_config(1234, use_gpu=0)
    trainer = ppo.PPOTrainer(config=config)
    trainer.restore(self.nn_path)
    policy = trainer.get_policy()
    sequential_nn = convert_ray_policy_to_sequential(policy).cpu()
    layers = []
    for l in sequential_nn:
        layers.append(l)
    nn = torch.nn.Sequential(*layers)
    torch.save(nn, pickled_path)
    return nn
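# Hedged sketch, not part of the original file: the caching branch above relies on
# torch.save/torch.load round-tripping a full nn.Sequential (architecture plus
# weights). A minimal self-contained check of that behaviour, assuming the pre-2.6
# torch.load defaults this codebase was written against:
def _example_cache_roundtrip(tmp_path="cached_nn.pickle"):
    import os
    import torch
    net = torch.nn.Sequential(torch.nn.Linear(2, 4), torch.nn.ReLU(), torch.nn.Linear(4, 2))
    torch.save(net, tmp_path)  # pickles the whole module, not just the state_dict
    restored = torch.load(tmp_path, map_location=torch.device('cpu'))
    x = torch.randn(1, 2)
    assert torch.allclose(net(x), restored(x))  # identical outputs after reload
    os.remove(tmp_path)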
def get_nn(self):
    config = get_PPO_config(1234)
    trainer = ppo.PPOTrainer(config=config)
    trainer.restore(self.nn_path)
    policy = trainer.get_policy()
    # sequential_nn = convert_ray_simple_policy_to_sequential(policy).cpu()
    sequential_nn = convert_ray_policy_to_sequential(policy).cpu()
    # l0 = torch.nn.Linear(5, 3, bias=False)
    # l0.weight = torch.nn.Parameter(torch.tensor([[0, 0, 1, 0, 0], [0, 0, 0, 1, 0], [0, 0, 0, 0, 1]], dtype=torch.float32))
    layers = []
    for l in sequential_nn:
        layers.append(l)
    nn = torch.nn.Sequential(*layers)
    # ray.shutdown()
    return nn
def get_nn(self):
    config = get_PPO_config(1234, use_gpu=0)
    trainer = ppo.PPOTrainer(config=config)
    trainer.restore(self.nn_path)
    policy = trainer.get_policy()
    sequential_nn = convert_ray_policy_to_sequential(policy).cpu()
    # l0 = torch.nn.Linear(6, 2, bias=False)
    # l0.weight = torch.nn.Parameter(torch.tensor([[0, 0, 1, -1, 0, 0], [1, -1, 0, 0, 0, 0]], dtype=torch.float32))
    # layers = [l0]
    # for l in sequential_nn:
    #     layers.append(l)
    # nn = torch.nn.Sequential(*layers)
    nn = sequential_nn
    # ray.shutdown()
    return nn
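# Hedged sketch, not from the original source: what the commented-out l0 layer above
# would compute if prepended. Assuming the 6-d state starts with
# (x_lead, x_ego, v_lead, v_ego, ...), the bias-free projection yields exactly the
# two reduced features (delta_v, delta_x) used elsewhere in these snippets.
def _example_prefix_projection():
    import torch
    l0 = torch.nn.Linear(6, 2, bias=False)
    l0.weight = torch.nn.Parameter(torch.tensor([[0, 0, 1, -1, 0, 0], [1, -1, 0, 0, 0, 0]], dtype=torch.float32))
    state = torch.tensor([[30.0, 10.0, 5.0, 3.0, 0.0, 0.0]])
    with torch.no_grad():
        out = l0(state)  # tensor([[2., 20.]]): v_lead - v_ego, x_lead - x_ego
    return out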
def get_nn(self):
    pickled_path = self.nn_path + ".pickle"
    if os.path.exists(pickled_path):
        nn = torch.load(pickled_path, map_location=torch.device('cpu'))
        return nn
    config = get_PPO_config(1234, 0)
    trainer = ppo.PPOTrainer(config=config)
    trainer.restore(self.nn_path)
    policy = trainer.get_policy()
    sequential_nn = convert_ray_policy_to_sequential(policy).cpu()
    # l0 = torch.nn.Linear(6, 2, bias=False)
    # l0.weight = torch.nn.Parameter(torch.tensor([[0, 0, 1, -1, 0, 0], [1, -1, 0, 0, 0, 0]], dtype=torch.float32))
    # layers = [l0]
    # for l in sequential_nn:
    #     layers.append(l)
    # nn = torch.nn.Sequential(*layers)
    nn = sequential_nn
    torch.save(nn, pickled_path)
    # ray.shutdown()
    return nn
def setup(self, config):
    path1 = config["path"]
    path_invariant = config["path_invariant"]
    batch_size = config["batch_size"]
    train_data = GridSearchDataset()
    val_data = GridSearchDataset()
    train_loader = DataLoader(train_data, batch_size=batch_size, shuffle=True)
    val_loader = DataLoader(val_data, batch_size=batch_size)
    invariant_model = torch.nn.Sequential(torch.nn.Linear(2, 50), torch.nn.ReLU(), torch.nn.Linear(50, 1), torch.nn.Tanh())
    invariant_model.load_state_dict(torch.load(path_invariant, map_location=torch.device('cpu')))  # load the invariant model
    invariant_model.cuda()
    # Use a separate name for the PPO config so it does not shadow the operator's
    # `config` dict, which is still read for "lr" below.
    ppo_config = get_PPO_config(1234)
    trainer = ppo.PPOTrainer(config=ppo_config)
    trainer.restore(path1)
    policy = trainer.get_policy()
    sequential_nn = convert_ray_policy_to_sequential(policy)  # load the agent model
    sequential_nn.cuda()
    model = sequential_nn
    optimizer = torch.optim.Adam(model.parameters(), lr=config.get("lr", 1e-3))
    loss = RetrainLoss(invariant_model)  # torch.nn.MSELoss()
    self.models, self.optimizer, self.criterion = self.register(models=[model, invariant_model], optimizers=optimizer, criterion=loss)
    self.model = self.models[0]
    self.register_data(train_loader=train_loader, validation_loader=val_loader)
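# Hedged sketch, an illustration rather than the project's code: the invariant model
# registered in setup() maps a 2-d input, plausibly (delta_x, delta_v), through a Tanh
# to a score in (-1, 1). Evaluating an identically shaped (here randomly initialised)
# network over a grid shows the mechanics; real weights would come from path_invariant.
def _example_invariant_surface():
    import numpy as np
    import torch
    invariant_model = torch.nn.Sequential(torch.nn.Linear(2, 50), torch.nn.ReLU(), torch.nn.Linear(50, 1), torch.nn.Tanh())
    grid = np.stack(np.meshgrid(np.arange(-10, 40, 5.0), np.arange(-30, 30, 5.0)), axis=-1).reshape(-1, 2)
    with torch.no_grad():
        scores = invariant_model(torch.from_numpy(grid).float()).squeeze(1)  # one score in (-1, 1) per grid point
    return scores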
import ray
from ray.rllib.agents.ppo import ppo
from environment.collision_avoidance import ColAvoidEnvDiscrete
from training.ppo.tune.tune_train_PPO_collision_avoidance import get_PPO_config
from training.ray_utils import convert_ray_policy_to_sequential

ray.init()
# register_env("fishing", env_creator)
config = get_PPO_config(1234)
trainer = ppo.PPOTrainer(config=config)
# trainer.restore("/home/edoardo/ray_results/tune_PPO_lunar_hover/PPO_LunarHover_7ba4e_00000_0_2021-04-02_19-01-43/checkpoint_990/checkpoint-990")
trainer.restore("/home/edoardo/ray_results/tune_PPO_collision_avoidance/PPO_ColAvoidEnvDiscrete_12944_00000_0_2021-04-26_15-24-12/checkpoint_160/checkpoint-160")
policy = trainer.get_policy()
# sequential_nn = convert_ray_simple_policy_to_sequential(policy).cpu()
sequential_nn = convert_ray_policy_to_sequential(policy).cpu()
# l0 = torch.nn.Linear(4, 2, bias=False)
# l0.weight = torch.nn.Parameter(torch.tensor([[0, 0, 1, 0], [0, 0, 0, 1]], dtype=torch.float32))
# layers = [l0]
# for l in sequential_nn:
#     layers.append(l)
# nn = torch.nn.Sequential(*layers)
nn = sequential_nn
env = ColAvoidEnvDiscrete()
# env.render()
plot_index = 0
position_list = []
n_trials = 10
cumulative_reward = 0
# clock = pygame.time.Clock()
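# Hedged sketch, the actual evaluation loop is not part of this excerpt: one way to
# roll out the converted policy greedily, using the gym-style reset/step interface
# seen elsewhere in these snippets. An illustration of the pattern, not the file's
# real continuation.
def _example_rollout(env, nn, n_trials=10):
    import numpy as np
    import torch
    total_reward = 0.0
    for _ in range(n_trials):
        state = env.reset()
        done = False
        while not done:
            state_t = torch.from_numpy(np.asarray(state, dtype=np.float32)).unsqueeze(0)
            action = torch.argmax(nn(state_t)).item()  # greedy discrete action
            state, reward, done, _ = env.step(action)
            total_reward += reward
    return total_reward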
if __name__ == '__main__':
    ray.init(local_mode=True)
    path1 = os.path.join(utils.get_save_dir(), "tune_PPO_stopping_car/PPO_StoppingCar_acc24_00001_1_cost_fn=0,epsilon_input=0_2021-01-21_02-30-49/checkpoint_58/checkpoint-58")
    path_invariant = os.path.join(utils.get_save_dir(), "invariant_checkpoint_old.pt")
    config = get_PPO_config(1234, use_gpu=0)
    trainer = ppo.PPOTrainer(config=config)
    trainer.restore(path1)
    policy = trainer.get_policy()
    old_agent_model = convert_ray_policy_to_sequential(policy).cpu()
    enable_training = True
    if enable_training:
        trainer1 = TorchTrainer(
            training_operator_cls=SafetyRetrainingOperator,
            num_workers=1,
            use_gpu=True,
            config={
                "lr": 1e-2,  # used in optimizer_creator
                "hidden_size": 1,  # used in model_creator
                "batch_size": 1024,  # used in data_creator
                "path": path1,  # path to load the agent nn
                "path_invariant": path_invariant,  # the path to the invariant network
            },