Example no. 1
 def get_nn(self):
     # restore the PPO checkpoint and return the policy network as a torch.nn.Sequential on CPU
     config = get_PPO_config(1234, 0)
     trainer = ppo.PPOTrainer(config=config)
     trainer.restore(self.nn_path)
     policy = trainer.get_policy()
     sequential_nn = convert_ray_policy_to_sequential(policy).cpu()
     nn = sequential_nn
     return nn
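
A minimal usage sketch (not from the original code): assuming `agent` is an instance of one of the classes in these examples and that the network expects a 2-feature input such as [delta_v, delta_x] in the stopping-car examples further down, the returned module can be queried greedily for an action.

import torch

nn = agent.get_nn()  # `agent` is a hypothetical instance of one of the classes in these examples
observation = torch.tensor([[5.0, 20.0]])  # e.g. [delta_v, delta_x]
action = torch.argmax(nn(observation)).item()  # greedy action from the policy logits
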
Example no. 2
 def get_nn(self):
     from training.ppo.tune.tune_train_PPO_inverted_pendulum import get_PPO_config
     config = get_PPO_config(1234, 0)
     trainer = ppo.PPOTrainer(config=config)
     trainer.restore(self.nn_path)
     policy = trainer.get_policy()
     sequential_nn = convert_ray_policy_to_sequential(policy).cpu()
     nn = sequential_nn
     return nn
Example no. 3
 def __init__(self, path, size=(500, 100), seed=1234, traces=True):
     self.size = size
     config = get_PPO_config(1234, use_gpu=0)
     trainer = ppo.PPOTrainer(config=config)
     trainer.restore(path)
     policy = trainer.get_policy()
     sequential_nn = convert_ray_policy_to_sequential(policy).cpu()
     config = {"cost_fn": 1, "simplified": True}
     self.env = StoppingCar(config)
     self.env.seed(seed)
     load_dataset = True
     file_name = "dataset_new.p"
     if load_dataset and traces and os.path.exists(file_name):
         dataset = pickle.load(open(file_name, "rb"))
     else:
         dataset = []
         while len(dataset) < size[0]:
             state_np = self.env.reset()  # only starting states
             state_reduced = torch.from_numpy(state_np).float().unsqueeze(0)[:, -2:]  # pick just delta_x and delta_v
             action = torch.argmax(sequential_nn(state_reduced)).item()
             next_state_np, reward, done, _ = self.env.step(action)
             dataset.append((state_np.astype(dtype=np.float32), next_state_np.astype(dtype=np.float32), 1))  # transitions from random starting states are labelled 1
         param_grid = {'delta_v': np.arange(-30, 30, 0.5), 'delta_x': np.arange(-10, 40, 0.5)}
         for parameters in ParameterGrid(param_grid):
             delta_v = parameters["delta_v"]
             delta_x = parameters["delta_x"]
             self.env.reset()
             self.env.x_lead = delta_x
             self.env.x_ego = 0
             self.env.v_lead = delta_v
             self.env.v_ego = 0
             done = False
             temp_dataset = []
             state_np = np.array([delta_v, delta_x])
             state_reduced = torch.from_numpy(state_np).float().unsqueeze(0)[:, -2:]  # pick just delta_x and delta_v
             for i in (range(100) if traces else range(1)):
                 # action = torch.argmax(sequential_nn(state_reduced)).item()
                 action = self.env.perfect_action()
                 next_state_np, reward, done, _ = self.env.step(action)
                 temp_dataset.append((state_np, next_state_np))
                 state_np = next_state_np
                 if next_state_np[1] < 0.5 and not done:
                     done = True
                 if done:  # only unsafe states
                     break
             if done:  # the trace reached an unsafe state: label its transitions -1
                 for state_np, next_state_np in temp_dataset:
                     dataset.append((state_np.astype(dtype=np.float32), next_state_np.astype(dtype=np.float32), -1))
             else:  # the trace stayed safe: label its transitions 0
                 for state_np, next_state_np in temp_dataset:
                     dataset.append((state_np.astype(dtype=np.float32), next_state_np.astype(dtype=np.float32), 0))
         if traces:
             pickle.dump(dataset, open(file_name, "wb+"))
     self.dataset = dataset
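
Example no. 11 further down consumes such data through a `GridSearchDataset` wrapped in a `DataLoader`; that class is not shown here, so the following is only a sketch of a compatible `torch.utils.data.Dataset` over the pickled `(state, next_state, label)` tuples, under the assumption that `GridSearchDataset` behaves roughly like this.

import pickle
import torch
from torch.utils.data import Dataset

class StateTransitionDataset(Dataset):  # hypothetical stand-in for GridSearchDataset
    def __init__(self, file_name="dataset_new.p"):
        # each entry is (state, next_state, label) with label in {1, 0, -1}
        self.data = pickle.load(open(file_name, "rb"))

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        state, next_state, label = self.data[idx]
        return (torch.from_numpy(state).float(),
                torch.from_numpy(next_state).float(),
                torch.tensor(label, dtype=torch.float32))
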
Example no. 4
 def get_nn(self):
     config = get_PPO_config(1234)
     trainer = ppo.PPOTrainer(config=config)
     trainer.restore(self.nn_path)
     policy = trainer.get_policy()
     sequential_nn = convert_ray_policy_to_sequential(policy).cpu()
     layers = []
     for l in sequential_nn:
         layers.append(l)
     nn = torch.nn.Sequential(*layers)
     return nn
Example no. 5
 def get_nn_old(self):
     config, trainer = get_PPO_trainer(use_gpu=0)
     trainer.restore(
         "/home/edoardo/ray_results/PPO_BouncingBall_2021-01-04_18-58-32smp2ln1g/checkpoint_272/checkpoint-272"
     )
     policy = trainer.get_policy()
     sequential_nn = convert_ray_policy_to_sequential(policy).cpu()
     layers = []
     for l in sequential_nn:
         layers.append(l)
     nn = torch.nn.Sequential(*layers)
     return nn
Example no. 6
 def get_nn_old(self):
     config, trainer = get_PPO_trainer(use_gpu=0)
     trainer.restore("/home/edoardo/ray_results/PPO_StoppingCar_2020-12-30_17-06-3265yz3d63/checkpoint_65/checkpoint-65")
     policy = trainer.get_policy()
     sequential_nn = convert_ray_policy_to_sequential(policy).cpu()
     # l0 = torch.nn.Linear(6, 2, bias=False)
     # l0.weight = torch.nn.Parameter(torch.tensor([[0, 0, 1, -1, 0, 0], [1, -1, 0, 0, 0, 0]], dtype=torch.float32))
     # layers = [l0]
     # for l in sequential_nn:
     #     layers.append(l)
     #
     # nn = torch.nn.Sequential(*layers)
     nn = sequential_nn
     # ray.shutdown()
     return nn
Example no. 7
 def get_nn(self):
     pickled_path = self.nn_path + ".pickle"
     if os.path.exists(pickled_path):  # reuse a previously converted and cached network
         nn = torch.load(pickled_path, map_location=torch.device('cpu'))
         return nn
     config = get_PPO_config(1234, use_gpu=0)
     trainer = ppo.PPOTrainer(config=config)
     trainer.restore(self.nn_path)
     policy = trainer.get_policy()
     sequential_nn = convert_ray_policy_to_sequential(policy).cpu()
     layers = []
     for l in sequential_nn:
         layers.append(l)
     nn = torch.nn.Sequential(*layers)
     torch.save(nn, pickled_path)
     return nn
Example no. 8
    def get_nn(self):
        config = get_PPO_config(1234)
        trainer = ppo.PPOTrainer(config=config)
        trainer.restore(self.nn_path)

        policy = trainer.get_policy()
        # sequential_nn = convert_ray_simple_policy_to_sequential(policy).cpu()
        sequential_nn = convert_ray_policy_to_sequential(policy).cpu()
        # l0 = torch.nn.Linear(5, 3, bias=False)
        # l0.weight = torch.nn.Parameter(torch.tensor([[0, 0, 1, 0, 0], [0, 0, 0, 1, 0], [0, 0, 0, 0, 1]], dtype=torch.float32))
        layers = []
        for l in sequential_nn:
            layers.append(l)
        nn = torch.nn.Sequential(*layers)
        # ray.shutdown()
        return nn
Example no. 9
 def get_nn(self):
     config = get_PPO_config(1234, use_gpu=0)
     trainer = ppo.PPOTrainer(config=config)
     trainer.restore(self.nn_path)
     policy = trainer.get_policy()
     sequential_nn = convert_ray_policy_to_sequential(policy).cpu()
     # l0 = torch.nn.Linear(6, 2, bias=False)
     # l0.weight = torch.nn.Parameter(torch.tensor([[0, 0, 1, -1, 0, 0], [1, -1, 0, 0, 0, 0]], dtype=torch.float32))
     # layers = [l0]
     # for l in sequential_nn:
     #     layers.append(l)
     #
     # nn = torch.nn.Sequential(*layers)
     nn = sequential_nn
     # ray.shutdown()
     return nn
Example no. 10
 def get_nn(self):
     pickled_path = self.nn_path + ".pickle"
     if os.path.exists(pickled_path):
         nn = torch.load(pickled_path, map_location=torch.device('cpu'))
         return nn
     config = get_PPO_config(1234, 0)
     trainer = ppo.PPOTrainer(config=config)
     trainer.restore(self.nn_path)
     policy = trainer.get_policy()
     sequential_nn = convert_ray_policy_to_sequential(policy).cpu()
     # l0 = torch.nn.Linear(6, 2, bias=False)
     # l0.weight = torch.nn.Parameter(torch.tensor([[0, 0, 1, -1, 0, 0], [1, -1, 0, 0, 0, 0]], dtype=torch.float32))
     # layers = [l0]
     # for l in sequential_nn:
     #     layers.append(l)
     #
     # nn = torch.nn.Sequential(*layers)
     nn = sequential_nn
     torch.save(nn, pickled_path)
     # ray.shutdown()
     return nn
Example no. 11
    def setup(self, config):
        path1 = config["path"]
        path_invariant = config["path_invariant"]
        batch_size = config["batch_size"]
        train_data = GridSearchDataset()
        val_data = GridSearchDataset()
        train_loader = DataLoader(train_data,
                                  batch_size=batch_size,
                                  shuffle=True)
        val_loader = DataLoader(val_data, batch_size=batch_size)

        invariant_model = torch.nn.Sequential(torch.nn.Linear(2, 50),
                                              torch.nn.ReLU(),
                                              torch.nn.Linear(50, 1),
                                              torch.nn.Tanh())
        invariant_model.load_state_dict(
            torch.load(
                path_invariant,
                map_location=torch.device('cpu')))  # load the invariant model
        invariant_model.cuda()
        ppo_config = get_PPO_config(1234)  # use a separate name so the operator config (with "lr", "batch_size", ...) is not shadowed
        trainer = ppo.PPOTrainer(config=ppo_config)
        trainer.restore(path1)
        policy = trainer.get_policy()
        sequential_nn = convert_ray_policy_to_sequential(
            policy)  # load the agent model
        sequential_nn.cuda()

        model = sequential_nn
        optimizer = torch.optim.Adam(model.parameters(),
                                     lr=config.get("lr", 1e-3))
        loss = RetrainLoss(invariant_model)  # torch.nn.MSELoss()

        self.models, self.optimizer, self.criterion = self.register(
            models=[model, invariant_model],
            optimizers=optimizer,
            criterion=loss)
        self.model = self.models[0]
        self.register_data(train_loader=train_loader,
                           validation_loader=val_loader)
Example no. 12
import ray
from ray.rllib.agents.ppo import ppo

from environment.collision_avoidance import ColAvoidEnvDiscrete
from training.ppo.tune.tune_train_PPO_collision_avoidance import get_PPO_config
from training.ray_utils import convert_ray_policy_to_sequential

ray.init()
# register_env("fishing", env_creator)
config = get_PPO_config(1234)
trainer = ppo.PPOTrainer(config=config)
# trainer.restore("/home/edoardo/ray_results/tune_PPO_lunar_hover/PPO_LunarHover_7ba4e_00000_0_2021-04-02_19-01-43/checkpoint_990/checkpoint-990")
trainer.restore("/home/edoardo/ray_results/tune_PPO_collision_avoidance/PPO_ColAvoidEnvDiscrete_12944_00000_0_2021-04-26_15-24-12/checkpoint_160/checkpoint-160")

policy = trainer.get_policy()
# sequential_nn = convert_ray_simple_policy_to_sequential(policy).cpu()
sequential_nn = convert_ray_policy_to_sequential(policy).cpu()
# l0 = torch.nn.Linear(4, 2, bias=False)
# l0.weight = torch.nn.Parameter(torch.tensor([[0, 0, 1, 0], [0, 0, 0, 1]], dtype=torch.float32))
# layers = [l0]
# for l in sequential_nn:
#     layers.append(l)
# nn = torch.nn.Sequential(*layers)
nn = sequential_nn
env = ColAvoidEnvDiscrete()
# env.render()
plot_index = 0
position_list = []
# env.render()
n_trials = 10
cumulative_reward = 0
# clock = pygame.time.Clock()
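
The rollout loop that would use `n_trials`, `cumulative_reward` and `position_list` is not part of this excerpt. The sketch below is an assumption rather than the original code: it presumes `torch` is imported, that `ColAvoidEnvDiscrete` follows the same gym-style `reset`/`step` interface as `StoppingCar` in Example no. 3, and that `position_list` is meant to collect visited states for plotting.

for trial in range(n_trials):
    state_np = env.reset()
    done = False
    while not done:
        state_tensor = torch.from_numpy(state_np).float().unsqueeze(0)
        action = torch.argmax(nn(state_tensor)).item()  # greedy action from the converted policy
        state_np, reward, done, _ = env.step(action)
        cumulative_reward += reward
        position_list.append(state_np)  # hypothetical: keep visited states for later plotting
print(f"average reward over {n_trials} trials: {cumulative_reward / n_trials}")
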
Example no. 13

if __name__ == '__main__':

    ray.init(local_mode=True)
    path1 = os.path.join(
        utils.get_save_dir(),
        "tune_PPO_stopping_car/PPO_StoppingCar_acc24_00001_1_cost_fn=0,epsilon_input=0_2021-01-21_02-30-49/checkpoint_58/checkpoint-58"
    )
    path_invariant = os.path.join(utils.get_save_dir(),
                                  "invariant_checkpoint_old.pt")
    config = get_PPO_config(1234, use_gpu=0)
    trainer = ppo.PPOTrainer(config=config)
    trainer.restore(path1)
    policy = trainer.get_policy()
    old_agent_model = convert_ray_policy_to_sequential(policy).cpu()

    enable_training = True
    if enable_training:
        trainer1 = TorchTrainer(
            training_operator_cls=SafetyRetrainingOperator,
            num_workers=1,
            use_gpu=True,
            config={
                "lr": 1e-2,  # used in optimizer_creator
                "hidden_size": 1,  # used in model_creator
                "batch_size": 1024,  # used in data_creator
                "path": path1,  # path to load the agent nn
                "path_invariant":
                path_invariant,  # the path to the invariant network
            },