class PPORLControllerWithActionProbs():

    def __init__(self, render, sim_config, exp_config, checkpoint_path):
        import ray
        from ray.tune.registry import register_env
        from ray.rllib.agents.ppo.ppo import PPOAgent
        from SingleLaneIDM.SimulatorCode.main_env import Wrapper

        # Render only when requested; disable acceleration noise for evaluation.
        sim_config["config"]["render"] = (render == 1)
        sim_config["config"]["acc-noise"] = False

        # Evaluate the restored agent on a single local worker without GPUs.
        exp_name = list(exp_config.keys())[0]
        exp_config[exp_name]["config"]["num_gpus"] = 0
        exp_config[exp_name]["config"]["num_workers"] = 1
        exp_config[exp_name]["config"]["num_envs_per_worker"] = 1

        register_env("tsim-v0", lambda config: Wrapper(sim_config))
        ray.init()
        self.agent = PPOAgent(env="tsim-v0",
                              config=exp_config[exp_name]["config"])
        self.agent.restore(checkpoint_path)
        # self.agent.optimizer.foreach_evaluator(
        #     lambda ev: ev.for_policy(lambda pi: pi.set_epsilon(0.0),
        #                              policy_id="default"))

    def getAction(self, observation):
        from scipy.special import softmax

        # Apply the same preprocessing and observation filter the policy was
        # trained with before querying the policy directly.
        state = []
        preprocessed = self.agent.local_evaluator.preprocessors[
            "default"].transform(observation)
        filtered_obs = self.agent.local_evaluator.filters["default"](
            preprocessed, update=False)
        result = self.agent.get_policy("default").compute_single_action(
            filtered_obs, state, None, None, None,
            clip_actions=self.agent.config["clip_actions"])
        action = result[0]
        # Turn the raw policy logits into a probability distribution.
        probs = softmax(result[2]["logits"])
        return action, probs
class PPORLController():

    def __init__(self, render, sim_config, exp_config, checkpoint_path):
        import ray
        from ray.tune.registry import register_env
        from ray.rllib.agents.ppo.ppo import PPOAgent
        from SingleLaneIDM.SimulatorCode.main_env import Wrapper

        # Render only when requested; disable acceleration noise for evaluation.
        sim_config["config"]["render"] = (render == 1)
        sim_config["config"]["acc-noise"] = False

        # Evaluate the restored agent on a single local worker without GPUs.
        exp_name = list(exp_config.keys())[0]
        exp_config[exp_name]["config"]["num_gpus"] = 0
        exp_config[exp_name]["config"]["num_workers"] = 1
        exp_config[exp_name]["config"]["num_envs_per_worker"] = 1

        register_env("tsim-v0", lambda config: Wrapper(sim_config))
        ray.init()
        self.agent = PPOAgent(env="tsim-v0",
                              config=exp_config[exp_name]["config"])
        self.agent.restore(checkpoint_path)

    def getAction(self, state):
        # compute_action applies the trained policy's preprocessing and
        # observation filtering internally.
        return self.agent.compute_action(state)
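A minimal usage sketch for these controllers, assuming the simulator and experiment configs are YAML files in the layout the constructors expect and that the wrapped env exposes the usual Gym reset/step API; all paths here are hypothetical placeholders:

```python
import yaml

from SingleLaneIDM.SimulatorCode.main_env import Wrapper

# Hypothetical paths -- substitute the real config and checkpoint locations.
with open("sim_config.yaml") as f:
    sim_config = yaml.safe_load(f)
with open("exp_config.yaml") as f:
    exp_config = yaml.safe_load(f)

# Each constructor calls ray.init(), so build only one controller per process.
controller = PPORLControllerWithActionProbs(
    render=0,
    sim_config=sim_config,
    exp_config=exp_config,
    checkpoint_path="checkpoints/checkpoint-100")  # hypothetical checkpoint

# Roll out one episode; assumes Wrapper follows the Gym API.
env = Wrapper(sim_config)
obs = env.reset()
done = False
while not done:
    action, probs = controller.getAction(obs)  # probs: softmaxed logits
    obs, reward, done, info = env.step(action)
```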
# def optimizer(self):
#     # return tf.train.GradientDescentOptimizer(0.001)
#     return tf.train.AdamOptimizer()

# def _train(self):
#     prev_steps = self.optimizer.num_steps_sampled
#     # start = time.time()
#     # while time.time() - start < self.config["min_iter_time_s"]:
#     self.optimizer.step()
#     result = self.optimizer.collect_metrics()
#     result.update(timesteps_this_iter=self.optimizer.num_steps_sampled -
#                   prev_steps)
#     return result

# PGAgent._policy_graph = NewPolicyGraph
# A2CAgent._policy_graph = NewPolicyGraph
# A2CAgent._train = _train

# agent = PGAgent(config=config, env="PointEnv")
# agent = A2CAgent(config=config, env="PointEnv")
agent = PPOAgent(config=config, env="PointEnv")

# res = agent.train()
for i in range(1000):
    print(i)
    res = agent.train()
    # Print only the per-episode metrics from the training result.
    print(
        pretty_print({
            key: value
            for key, value in res.items() if key.startswith("episode")
        }))
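The loop above presupposes imports and setup defined elsewhere in the script; a minimal sketch of what that context plausibly looks like (the PointEnv class itself is not shown in the source, so its registration is left as a commented placeholder, and the config dict here is an assumption):

```python
import ray
from ray.tune.logger import pretty_print
from ray.tune.registry import register_env
from ray.rllib.agents.ppo.ppo import PPOAgent

ray.init()

# The environment class is defined elsewhere in the script; register it
# under the name the agent constructor refers to.
# register_env("PointEnv", lambda env_config: PointEnv(env_config))

# A plausible minimal config; the real script's config dict is not shown.
config = {
    "num_workers": 1,
    "num_gpus": 0,
}
```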
"dqn_policy": (DQNPolicyGraph, obs_space, act_space, {}), } def policy_mapping_fn(agent_id): if agent_id % 2 == 0: return "ppo_policy" else: return "dqn_policy" ppo_trainer = PPOAgent( env="multi_cartpole", config={ "multiagent": { "policy_graphs": policy_graphs, "policy_mapping_fn": policy_mapping_fn, "policies_to_train": ["ppo_policy"], }, "simple_optimizer": True, # disable filters, otherwise we would need to synchronize those # as well to the DQN agent "observation_filter": "NoFilter", }) dqn_trainer = DQNAgent(env="multi_cartpole", config={ "multiagent": { "policy_graphs": policy_graphs, "policy_mapping_fn": policy_mapping_fn, "policies_to_train": ["dqn_policy"], }, "gamma": 0.95,
"ppo_policy": (PPOPolicyGraph, obs_space, act_space, {}), "dqn_policy": (DQNPolicyGraph, obs_space, act_space, {}), } def policy_mapping_fn(agent_id): if agent_id % 2 == 0: return "ppo_policy" else: return "dqn_policy" ppo_trainer = PPOAgent( env="multi_cartpole", config={ "multiagent": { "policy_graphs": policy_graphs, "policy_mapping_fn": policy_mapping_fn, "policies_to_train": ["ppo_policy"], }, # disable filters, otherwise we would need to synchronize those # as well to the DQN agent "observation_filter": "NoFilter", }) dqn_trainer = DQNAgent( env="multi_cartpole", config={ "multiagent": { "policy_graphs": policy_graphs, "policy_mapping_fn": policy_mapping_fn, "policies_to_train": ["dqn_policy"], }, "gamma": 0.95,