Example #1
 def get_nn(self):
     config = get_PPO_config(1234, 0)
     trainer = ppo.PPOTrainer(config=config)  # rebuild an RLlib PPO trainer with the project's config
     trainer.restore(self.nn_path)  # load the checkpoint weights
     policy = trainer.get_policy()
     sequential_nn = convert_ray_policy_to_sequential(policy).cpu()  # extract the policy network as a plain torch.nn.Sequential on CPU
     nn = sequential_nn
     return nn
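
This get_nn() snippet (and the variants in Examples #2, #4 and #5) is a method excerpt: it assumes the imports and the ray.init() call shown in Example #7, plus an object that stores the RLlib checkpoint path in self.nn_path. A minimal, hypothetical calling context follows; the class name, checkpoint path, environment-specific import and dummy input are placeholders, not the repository's code.

import ray
import ray.rllib.agents.ppo as ppo
import torch

from training.ppo.tune.tune_train_PPO_car import get_PPO_config
from training.ray_utils import convert_ray_policy_to_sequential


class CheckpointLoader:  # hypothetical wrapper; the repository's own classes may differ
    def __init__(self, nn_path):
        self.nn_path = nn_path  # RLlib checkpoint file, e.g. ".../checkpoint_58/checkpoint-58"

    def get_nn(self):  # same body as Example #1
        config = get_PPO_config(1234, 0)
        trainer = ppo.PPOTrainer(config=config)
        trainer.restore(self.nn_path)
        policy = trainer.get_policy()
        return convert_ray_policy_to_sequential(policy).cpu()


ray.init()  # PPOTrainer requires a running Ray instance
nn = CheckpointLoader("/path/to/checkpoint-58").get_nn()  # placeholder path
state = torch.zeros(1, 2)  # dummy (delta_v, delta_x) input, matching the reduced state of Example #3
action = torch.argmax(nn(state)).item()  # greedy action, as in Example #3
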
Example #2
 def get_nn(self):
     from training.ppo.tune.tune_train_PPO_inverted_pendulum import get_PPO_config
     config = get_PPO_config(1234, 0)
     trainer = ppo.PPOTrainer(config=config)
     trainer.restore(self.nn_path)
     policy = trainer.get_policy()
     sequential_nn = convert_ray_policy_to_sequential(policy).cpu()
     nn = sequential_nn
     return nn
Example #3
 def __init__(self, path, size=(500, 100), seed=1234, traces=True):
     self.size = size
     config = get_PPO_config(1234, use_gpu=0)
     trainer = ppo.PPOTrainer(config=config)
     trainer.restore(path)
     policy = trainer.get_policy()
     sequential_nn = convert_ray_policy_to_sequential(policy).cpu()
     config = {"cost_fn": 1, "simplified": True}
     self.env = StoppingCar(config)
     self.env.seed(seed)
     load_dataset = True
     file_name = "dataset_new.p"
     if load_dataset and traces and os.path.exists(file_name):
         dataset = pickle.load(open(file_name, "rb"))
     else:
         dataset = []
         while len(dataset) < size[0]:
             state_np = self.env.reset()  # only starting states
             state_reduced = torch.from_numpy(state_np).float().unsqueeze(0)[:, -2:]  # pick just delta_x and delta_v
             action = torch.argmax(sequential_nn(state_reduced)).item()
             next_state_np, reward, done, _ = self.env.step(action)
             dataset.append((state_np.astype(dtype=np.float32), next_state_np.astype(dtype=np.float32), 1))  # label 1: single-step transition from a random starting state
         param_grid = {'delta_v': np.arange(-30, 30, 0.5), 'delta_x': np.arange(-10, 40, 0.5)}
         for parameters in ParameterGrid(param_grid):
             delta_v = parameters["delta_v"]
             delta_x = parameters["delta_x"]
             self.env.reset()
             self.env.x_lead = delta_x
             self.env.x_ego = 0
             self.env.v_lead = delta_v
             self.env.v_ego = 0
             done = False
             temp_dataset = []
             state_np = np.array([delta_v, delta_x])
             state_reduced = torch.from_numpy(state_np).float().unsqueeze(0)[:, -2:]  # pick just delta_x and delta_v
             for i in (range(100) if traces else range(1)):
                 # action = torch.argmax(sequential_nn(state_reduced)).item()
                 action = self.env.perfect_action()
                 next_state_np, reward, done, _ = self.env.step(action)
                 temp_dataset.append((state_np, next_state_np))
                 state_np = next_state_np
                 if next_state_np[1] < 0.5 and not done:  # a distance gap (delta_x) below 0.5 counts as unsafe even if the env has not terminated
                     done = True
                 if done is True:  # only unsafe states
                     break
             if done:  # the trace reached an unsafe state: label all its transitions -1
                 for state_np, next_state_np in temp_dataset:
                     dataset.append((state_np.astype(dtype=np.float32), next_state_np.astype(dtype=np.float32), -1))
             else:  # the trace stayed safe: label its transitions 0
                 for state_np, next_state_np in temp_dataset:
                     dataset.append((state_np.astype(dtype=np.float32), next_state_np.astype(dtype=np.float32), 0))
         if traces:
             pickle.dump(dataset, open(file_name, "wb+"))
     self.dataset = dataset
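
This constructor matches the TrainedPolicyDataset used in Example #8, and the tuples it collects carry three labels: 1 for a single-step transition sampled from a random starting state, -1 for transitions on a trace that reaches an unsafe state, and 0 for transitions on a trace that stays safe. Below is a minimal, hypothetical sketch of how such a list could be wrapped as a torch Dataset for use with a DataLoader like the ones in Example #6; the class name and tensor conversion are assumptions, not the repository's code.

import torch
from torch.utils.data import Dataset


class TransitionDataset(Dataset):  # hypothetical wrapper around the list built above
    def __init__(self, dataset):
        self.dataset = dataset  # list of (state_np, next_state_np, label) tuples

    def __len__(self):
        return len(self.dataset)

    def __getitem__(self, idx):
        state_np, next_state_np, label = self.dataset[idx]
        return (torch.from_numpy(state_np),                 # current state
                torch.from_numpy(next_state_np),            # successor state
                torch.tensor(label, dtype=torch.float32))   # 1 / 0 / -1 label
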
Example #4
 def get_nn(self):
     config = get_PPO_config(1234, use_gpu=0)
     trainer = ppo.PPOTrainer(config=config)
     trainer.restore(self.nn_path)
     policy = trainer.get_policy()
     sequential_nn = convert_ray_policy_to_sequential(policy).cpu()
     # l0 = torch.nn.Linear(6, 2, bias=False)
     # l0.weight = torch.nn.Parameter(torch.tensor([[0, 0, 1, -1, 0, 0], [1, -1, 0, 0, 0, 0]], dtype=torch.float32))
     # layers = [l0]
     # for l in sequential_nn:
     #     layers.append(l)
     #
     # nn = torch.nn.Sequential(*layers)
     nn = sequential_nn
     # ray.shutdown()
     return nn
Example #5
 def get_nn(self):
     pickled_path = self.nn_path + ".pickle"
     if os.path.exists(pickled_path):
         nn = torch.load(pickled_path, map_location=torch.device('cpu'))
         return nn
     config = get_PPO_config(1234, 0)
     trainer = ppo.PPOTrainer(config=config)
     trainer.restore(self.nn_path)
     policy = trainer.get_policy()
     sequential_nn = convert_ray_policy_to_sequential(policy).cpu()
     # l0 = torch.nn.Linear(6, 2, bias=False)
     # l0.weight = torch.nn.Parameter(torch.tensor([[0, 0, 1, -1, 0, 0], [1, -1, 0, 0, 0, 0]], dtype=torch.float32))
     # layers = [l0]
     # for l in sequential_nn:
     #     layers.append(l)
     #
     # nn = torch.nn.Sequential(*layers)
     nn = sequential_nn
     torch.save(nn, pickled_path)
     # ray.shutdown()
     return nn
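
Example #5 differs from Example #4 only in the caching wrapped around the conversion: the converted network is written next to the checkpoint as self.nn_path + ".pickle" with torch.save, so subsequent calls load it via torch.load(..., map_location='cpu') and skip rebuilding the Ray trainer altogether.
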
Example #6
    def setup(self, config):
        path1 = config["path"]
        path_invariant = config["path_invariant"]
        batch_size = config["batch_size"]
        train_data = GridSearchDataset()
        val_data = GridSearchDataset()
        train_loader = DataLoader(train_data,
                                  batch_size=batch_size,
                                  shuffle=True)
        val_loader = DataLoader(val_data, batch_size=batch_size)

        invariant_model = torch.nn.Sequential(torch.nn.Linear(2, 50),
                                              torch.nn.ReLU(),
                                              torch.nn.Linear(50, 1),
                                              torch.nn.Tanh())
        invariant_model.load_state_dict(
            torch.load(
                path_invariant,
                map_location=torch.device('cpu')))  # load the invariant model
        invariant_model.cuda()
        ppo_config = get_PPO_config(1234)  # separate name so the RLlib config does not shadow the config dict passed to setup()
        trainer = ppo.PPOTrainer(config=ppo_config)
        trainer.restore(path1)
        policy = trainer.get_policy()
        sequential_nn = convert_ray_policy_to_sequential(
            policy)  # load the agent model
        sequential_nn.cuda()

        model = sequential_nn
        optimizer = torch.optim.Adam(model.parameters(),
                                     lr=config.get("lr", 1e-3))
        loss = RetrainLoss(invariant_model)  # torch.nn.MSELoss()

        self.models, self.optimizer, self.criterion = self.register(
            models=[model, invariant_model],
            optimizers=optimizer,
            criterion=loss)
        self.model = self.models[0]
        self.register_data(train_loader=train_loader,
                           validation_loader=val_loader)
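
This setup() hook follows Ray's RaySGD TrainingOperator API (self.register / self.register_data): the config dict it receives is the one passed to TorchTrainer, which supplies keys such as "lr", "batch_size" and "path", as shown in Example #8.
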
Example #7
import csv
import os

import numpy as np
import ray
import ray.rllib.agents.ppo as ppo
import torch.nn

from environment.stopping_car import StoppingCar
from training.ppo.tune.tune_train_PPO_car import get_PPO_config
from training.ray_utils import convert_ray_policy_to_sequential

ray.init()
# config, trainer = get_PPO_trainer(use_gpu=0)

config = get_PPO_config(1234)
trainer = ppo.PPOTrainer(config=config)
# trainer.restore("/home/edoardo/ray_results/PPO_StoppingCar_2020-12-30_17-06-3265yz3d63/checkpoint_65/checkpoint-65") # 5e-2 ~19.8 delta x
# trainer.restore("/home/edoardo/ray_results/tune_PPO_stopping_car/PPO_StoppingCar_e6ed1_00000_0_cost_fn=0_2021-01-15_19-57-40/checkpoint_440/checkpoint-440")
# trainer.restore("/home/edoardo/ray_results/tune_PPO_stopping_car/PPO_StoppingCar_c1c7e_00006_6_cost_fn=0,epsilon_input=0_2021-01-17_12-44-54/checkpoint_41/checkpoint-41")
trainer.restore(
    "/home/edoardo/ray_results/tune_PPO_stopping_car/PPO_StoppingCar_acc24_00001_1_cost_fn=0,epsilon_input=0_2021-01-21_02-30-49/checkpoint_58/checkpoint-58"
)
# trainer.restore("/home/edoardo/ray_results/tune_PPO_stopping_car/PPO_StoppingCar_c1c7e_00005_5_cost_fn=0,epsilon_input=0.1_2021-01-17_12-41-27/checkpoint_10/checkpoint-10")
policy = trainer.get_policy()
sequential_nn = convert_ray_policy_to_sequential(policy).cpu()
l0 = torch.nn.Linear(6, 2, bias=False)
l0.weight = torch.nn.Parameter(
    torch.tensor([[0, 0, 1, -1, 0, 0], [1, -1, 0, 0, 0, 0]],
                 dtype=torch.float32))
layers = [l0]
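
The script breaks off after layers = [l0]. Judging from the identical commented-out block in Examples #4 and #5, the prepended 6-to-2 difference layer is presumably combined with the converted policy as follows; this is a sketch of the likely continuation, not the original file.

# sketch, mirroring the commented-out pattern of Examples #4 and #5
for l in sequential_nn:
    layers.append(l)
nn = torch.nn.Sequential(*layers)  # maps the raw 6-dim observation to the two differences, then applies the policy
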
Example #8
            # _, predicted = torch.max(output.data, 1)

        num_samples = targets.size(0)
        num_correct = 0  # todo find a good value (predicted == target).sum().item()
        return {
            "val_loss": loss.item(),
            "val_accuracy": num_correct / num_samples,
            NUM_SAMPLES: num_samples
        }


enable_training = True
path1 = "/home/edoardo/ray_results/tune_PPO_stopping_car/PPO_StoppingCar_acc24_00001_1_cost_fn=0,epsilon_input=0_2021-01-21_02-30-49/checkpoint_58/checkpoint-58"
# path1 = "/home/edoardo/ray_results/tune_PPO_stopping_car/PPO_StoppingCar_c1c7e_00005_5_cost_fn=0,epsilon_input=0.1_2021-01-17_12-41-27/checkpoint_10/checkpoint-10"
val_data = TrainedPolicyDataset(path1, size=(0, 0), seed=4567, traces=False)
config = get_PPO_config(1234, use_gpu=0)
trainer = ppo.PPOTrainer(config=config)
trainer.restore(path1)
policy = trainer.get_policy()
sequential_nn = convert_ray_policy_to_sequential(policy).cpu()

if enable_training:
    trainer1 = TorchTrainer(
        training_operator_cls=SafetyTrainingOperator,
        num_workers=1,
        use_gpu=True,
        config={
            "lr": 1e-2,  # used in optimizer_creator
            "hidden_size": 1,  # used in model_creator
            "batch_size": 1024,  # used in data_creator
            "path": path1,  # path to load the agent nn